From 9d06a1fbc059656923e7213c29fab592195d6c10 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 14:43:07 -0400 Subject: [PATCH 01/33] #1342: Allow use of truncated `Dictionary` for coherence calculation by avoiding lookup of tokens not in the topic token lists. --- gensim/models/coherencemodel.py | 49 ++++--- gensim/test/test_probability_estimation.py | 87 ++++++++++-- .../topic_coherence/probability_estimation.py | 125 ++++++++++++------ 3 files changed, 191 insertions(+), 70 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 161d0257a4..130c285822 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,6 +19,7 @@ """ import logging +import multiprocessing as mp from gensim import interfaces from gensim.topic_coherence import (segmentation, probability_estimation, @@ -89,7 +90,8 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, + coherence='c_v', topn=10): """ Args: ---- @@ -128,8 +130,10 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= raise ValueError("One of model or topics has to be provided.") elif topics is not None and dictionary is None: raise ValueError("dictionary has to be provided if topics are to be used.") + if texts is None and corpus is None: raise ValueError("One of texts or corpus has to be provided.") + # Check if associated dictionary is provided. if dictionary is None: if isinstance(model.id2word, FakeDict): @@ -139,6 +143,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.dictionary = model.id2word else: self.dictionary = dictionary + # Check for correct inputs for u_mass coherence measure. if coherence in boolean_document_based: if is_corpus(corpus)[0]: @@ -148,6 +153,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] else: raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: self.window_size = window_size @@ -157,6 +163,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.texts = texts else: raise ValueError("%s coherence is not currently supported." % coherence) + self.topn = topn self.model = model if model is not None: @@ -193,27 +200,29 @@ def _get_topics(self): "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics - def get_coherence(self): - """ - Return coherence value based on pipeline parameters. 
- """ + def get_coherence_per_topic(self): measure = coherence_dict[self.coherence] segmented_topics = measure.seg(self.topics) + if self.coherence in boolean_document_based: per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs) - elif self.coherence in sliding_window_based: - if self.window_size is not None: - self.window_size = sliding_windows_dict[self.coherence] - per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) - if self.coherence == 'c_v': - confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) - else: - if self.coherence == 'c_npmi': - normalize = True - else: - # For c_uci - normalize = False - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + return measure.conf(segmented_topics, per_topic_postings, num_docs) + + if self.window_size is not None: + self.window_size = sliding_windows_dict[self.coherence] + per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) + if self.coherence == 'c_v': + return measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + else: + normalize = self.coherence == 'c_npmi' + return measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + + def aggregate_measures(self, confirmed_measures): + measure = coherence_dict[self.coherence] return measure.aggr(confirmed_measures) + + def get_coherence(self): + """Return coherence value based on pipeline parameters.""" + confirmed_measures = self.get_coherence_per_topic() + return self.aggregate_measures(confirmed_measures) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 596f91f65b..09d9ee071f 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -13,15 +13,20 @@ from gensim.topic_coherence import probability_estimation from gensim.corpora.hashdictionary import HashDictionary +from gensim.corpora.dictionary import Dictionary -class TestProbabilityEstimation(unittest.TestCase): + +class ProbabilityEstimationBase(unittest.TestCase): + texts = [['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees']] + + +class TestProbabilityEstimation(ProbabilityEstimationBase): def setUp(self): - self.texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] self.dictionary = HashDictionary(self.texts) # Following is the mapping: # {'computer': 10608, @@ -36,21 +41,77 @@ def setUp(self): # 'user': 12736} self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [[(5798, 18451), (10608, 18451), (10608, 5798)], [(10608, 18451), (12736, 18451), (12736, 10608)]] + self.segmented_topics = [ + [ + (5798, 18451), + (10608, 18451), + (10608, 5798) + ], [ + (10608, 18451), + (12736, 18451), + (12736, 10608) + ] + ] def testPBooleanDocument(self): """Test p_boolean_document()""" # Unique topic ids are 
5798, 10608, 12736 and 18451 obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - expected = {18451: set([5]), 12736: set([1, 3]), 5798: set([1, 2]), 10608: set([0])} - self.assertTrue(obtained == expected) + expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} + self.assertEqual(expected, obtained) + + def testPBooleanSlidingWindow(self): + """Test p_boolean_sliding_window()""" + # Test with window size as 2. window_id is zero indexed. + obtained, _ = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}} + self.assertEqual(expected, obtained) + + +class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): + def setUp(self): + self.dictionary = Dictionary(self.texts) + self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()} + # Following is the mapping: + # {u'computer': 1, + # u'eps': 5, + # u'graph': 9, + # u'human': 2, + # u'interface': 0, + # u'response': 6, + # u'system': 4, + # u'time': 7, + # u'trees': 8, + # u'user': 3} + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + # Suppose the segmented topics from s_one_pre are: + self.segmented_topics = [ + [ + (4, 9), + (1, 9), + (1, 4) + ], [ + (1, 9), + (3, 9), + (3, 1) + ] + ] + + def testPBooleanDocument(self): + """Test p_boolean_document()""" + obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} + self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. - obtained, _ = probability_estimation.p_boolean_sliding_window(self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: set([1]), 12736: set([8, 2, 3]), 18451: set([11]), 5798: set([4, 5, 6, 7])} - self.assertTrue(obtained == expected) + obtained, _ = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + expected = {1: {1}, 3: {8, 2, 3}, 9: {11}, 4: {4, 5, 6, 7}} + self.assertEqual(expected, obtained) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 8922c511a3..c7e3c4d3d2 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -9,28 +9,45 @@ """ import logging -import numpy as np - -from gensim.corpora import Dictionary - from itertools import chain, islice +from collections import defaultdict + +import numpy as np logger = logging.getLogger(__name__) + def _ret_top_ids(segmented_topics): """ Helper function to return a set of all the unique topic ids in segmented topics. """ top_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: - for id in chain.from_iterable(s_i): - if isinstance(id, np.ndarray): - for i in id: + for word_id in chain.from_iterable(s_i): + if isinstance(word_id, np.ndarray): + for i in word_id: top_ids.add(i) else: - top_ids.add(id) + top_ids.add(word_id) + return top_ids + +def _ids_to_words(ids, dictionary): + """Convert an iterable of ids to their corresponding words using a dictionary. + This function abstracts away the differences between the HashDictionary and the standard one. 
+ """ + top_words = set() + for word_id in ids: + word = dictionary[word_id] + if isinstance(word, set): + top_words = top_words.union(word) + else: + top_words.add(word) + + return top_words + + def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -48,18 +65,65 @@ def p_boolean_document(corpus, segmented_topics): """ top_ids = _ret_top_ids(segmented_topics) # Instantiate the dictionary with empty sets for each top_id - per_topic_postings = {} - for id in top_ids: - per_topic_postings[id] = set() + per_topic_postings = {word_id: set() for word_id in top_ids} + # Iterate through the documents, appending the document number to the set for each top_id it contains for n, document in enumerate(corpus): doc_words = frozenset(x[0] for x in document) top_ids_in_doc = top_ids.intersection(doc_words) if len(top_ids_in_doc) > 0: - for id in top_ids_in_doc: - per_topic_postings[id].add(n) - num_docs = len(corpus) - return (per_topic_postings, num_docs) + for word_id in top_ids_in_doc: + per_topic_postings[word_id].add(n) + + return per_topic_postings, len(corpus) + + +def _iter_windows(texts, window_size): + """Produce a generator over the given texts using a sliding window of `window_size`. + + Args: + ---- + texts: List of string sentences. + window_size: Size of sliding window. + + """ + for document in texts: + it = iter(document) + window = tuple(islice(it, window_size)) + yield window + + for elem in it: + window = window[1:] + (elem,) + yield window + + +class WordOccurrenceAccumulator(object): + """Accumulate word occurrences from a sequence of documents.""" + + def __init__(self, relevant_words): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. + """ + self.relevant_words = set(relevant_words) + self.window_id = 0 # id of next document to be observed + self.word_occurrences = defaultdict(set) # map from words to ids of docs they occur in + + def filter_to_relevant_words(self, doc): + return (word for word in doc if word in self.relevant_words) + + def add_occurrences_from_doc(self, window): + for word in self.filter_to_relevant_words(window): + self.word_occurrences[word].add(self.window_id) + + self.window_id += 1 + + def accumulate(self, texts, window_size): + for virtual_document in _iter_windows(texts, window_size): + self.add_occurrences_from_doc(virtual_document) + return self + def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ @@ -81,26 +145,13 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - window_id = 0 # Each window assigned a window id. - per_topic_postings = {} - token2id_dict = dictionary.token2id - def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): - for word in window: - word_id = token2id_dict[word] - if word_id in top_ids: - if word_id in per_topic_postings: - per_topic_postings[word_id].add(window_id) - else: - per_topic_postings[word_id] = set([window_id]) - window_id += 1 - return (window_id, per_topic_postings) - # Apply boolean sliding window to each document in texts. 
- for document in texts: - it = iter(document) - window = tuple(islice(it, window_size)) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) - for elem in it: - window = window[1:] + (elem,) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) + top_words = _ids_to_words(top_ids, dictionary) + occurrence_accumulator = WordOccurrenceAccumulator(top_words)\ + .accumulate(texts, window_size) + + # Replace words with their ids. + occurrences = occurrence_accumulator.word_occurrences + per_topic_postings = {dictionary.token2id[word]: id_set + for word, id_set in occurrences.iteritems()} - return per_topic_postings, window_id + return per_topic_postings, occurrence_accumulator.window_id From f69a2ffa7fe2b9254c61393d057201fa4a331ed7 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 16:14:19 -0400 Subject: [PATCH 02/33] #1342: Do not produce sliding windows for texts with no relevant words, and ensure each relevant word has a set in the `per_topic_postings` dict. --- gensim/test/test_probability_estimation.py | 4 ++-- gensim/topic_coherence/probability_estimation.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 09d9ee071f..68ac24e752 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -65,7 +65,7 @@ def testPBooleanSlidingWindow(self): # Test with window size as 2. window_id is zero indexed. obtained, _ = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}} + expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {10}, 5798: {4, 5, 6, 7}} self.assertEqual(expected, obtained) @@ -109,7 +109,7 @@ def testPBooleanSlidingWindow(self): # Test with window size as 2. window_id is zero indexed. obtained, _ = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {1: {1}, 3: {8, 2, 3}, 9: {11}, 4: {4, 5, 6, 7}} + expected = {1: {1}, 3: {8, 2, 3}, 9: {10}, 4: {4, 5, 6, 7}} self.assertEqual(expected, obtained) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index c7e3c4d3d2..ff9c0708bc 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -119,8 +119,15 @@ def add_occurrences_from_doc(self, window): self.window_id += 1 + def text_is_relevant(self, text): + for word in text: + if word in self.relevant_words: + return True + return False + def accumulate(self, texts, window_size): - for virtual_document in _iter_windows(texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in _iter_windows(relevant_texts, window_size): self.add_occurrences_from_doc(virtual_document) return self @@ -154,4 +161,9 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): per_topic_postings = {dictionary.token2id[word]: id_set for word, id_set in occurrences.iteritems()} + # Ensure all top ids have a corresponding set, even if it's an empty one. 
+ for word_id in top_ids: + if word_id not in per_topic_postings: + per_topic_postings[word_id] = set() + return per_topic_postings, occurrence_accumulator.window_id From 26de54726b0790be73e2fe70614c13f9a2334f0e Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 16:23:16 -0400 Subject: [PATCH 03/33] #1342: Remove unused multiprocessing import in `coherencemodel` module. --- gensim/models/coherencemodel.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 130c285822..4e110681e2 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,20 +19,18 @@ """ import logging -import multiprocessing as mp +from collections import namedtuple + +import numpy as np from gensim import interfaces +from gensim.matutils import argsort +from gensim.models.ldamodel import LdaModel +from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) -from gensim.matutils import argsort from gensim.utils import is_corpus, FakeDict -from gensim.models.ldamodel import LdaModel -from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet - -import numpy as np - -from collections import namedtuple logger = logging.getLogger(__name__) From dfe159b17dd4e8a79e3cad96d53b4f54079452b4 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 11:07:24 -0400 Subject: [PATCH 04/33] add utility functions for strided windowing of texts (lists of strings representation of corpus) --- gensim/test/test_utils.py | 68 +++++++++++++++++++++++++++++++++++++-- gensim/utils.py | 50 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 41f20eb232..cbdac0170b 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -13,6 +13,8 @@ from gensim import utils from six import iteritems +import numpy as np + class TestIsCorpus(unittest.TestCase): def test_None(self): @@ -90,8 +92,70 @@ def test_sample_dict(self): self.assertEqual(sampled_dict,expected_dict) sampled_dict_random = utils.sample_dict(d,2) if sampled_dict_random in expected_dict_random: - self.assertTrue(True) - + self.assertTrue(True) + + +class TestWindowing(unittest.TestCase): + + arr10_5 = np.array([ + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + [4, 5, 6, 7, 8], + [5, 6, 7, 8, 9] + ]) + + def _assert_arrays_equal(self, expected, actual): + self.assertEqual(expected.shape, actual.shape) + self.assertTrue((actual == expected).all()) + + def test_strided_windows1(self): + out = utils.strided_windows(range(5), 2) + expected = np.array([ + [0, 1], + [1, 2], + [2, 3], + [3, 4] + ]) + self._assert_arrays_equal(expected, out) + + def test_strided_windows2(self): + input_arr = np.arange(10) + out = utils.strided_windows(input_arr, 5) + expected = self.arr10_5.copy() + self._assert_arrays_equal(expected, out) + out[0, 0] = 10 + self.assertEqual(10, input_arr[0], "should make view rather than copy") + + def test_iter_windows_list_texts(self): + texts = [['this', 'is', 'a'], ['test', 'document']] + windows = list(utils.iter_windows(texts, 2)) + list_windows = [list(iterable) for iterable in windows] + expected = [['this', 'is'], ['is', 'a'], ['test', 'document']] + self.assertListEqual(list_windows, expected) + + def 
test_iter_windows_uses_views(self): + texts = [np.array(['this', 'is', 'a'], dtype='object'), ['test', 'document']] + windows = list(utils.iter_windows(texts, 2)) + list_windows = [list(iterable) for iterable in windows] + expected = [['this', 'is'], ['is', 'a'], ['test', 'document']] + self.assertListEqual(list_windows, expected) + windows[0][0] = 'modified' + self.assertEqual('modified', texts[0][0]) + + def test_iter_windows_with_copy(self): + texts = [ + np.array(['this', 'is', 'a'], dtype='object'), + np.array(['test', 'document'], dtype='object') + ] + windows = list(utils.iter_windows(texts, 2, copy=True)) + + windows[0][0] = 'modified' + self.assertEqual('this', texts[0][0]) + + windows[2][0] = 'modified' + self.assertEqual('test', texts[1][0]) if __name__ == '__main__': diff --git a/gensim/utils.py b/gensim/utils.py index 8d5fdb7d7f..36d70b1927 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1188,3 +1188,53 @@ def sample_dict(d, n=10, use_random=True): """ selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) return [(key, d[key]) for key in selected_keys] + + +def strided_windows(ndarray, window_size): + """ + Produce a numpy.ndarray of windows, as from a sliding window. + + >>> strided_windows(np.arange(5), 2) + array([[0, 1], + [1, 2], + [2, 3], + [3, 4]]) + >>> strided_windows(np.arange(10), 5) + array([[0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + [4, 5, 6, 7, 8], + [5, 6, 7, 8, 9]]) + + Args: + ---- + ndarray: either a numpy.ndarray or something that can be converted into one. + window_size: sliding window size. + :param window_size: + :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over + the `ndarray`. Since this uses striding, the individual arrays are views rather than + copies of `ndarray`. Changes to one view modifies the others and the original. + """ + ndarray = np.asarray(ndarray) + stride = ndarray.strides[0] + return np.lib.stride_tricks.as_strided( + ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size), + strides=(stride, stride)) + + +def iter_windows(texts, window_size, copy=False): + """Produce a generator over the given texts using a sliding window of `window_size`. + The windows produced are views of some subsequence of a text. To use deep copies + instead, pass `copy=True`. + + Args: + ---- + texts: List of string sentences. + window_size: Size of sliding window. + copy: False to use views of the texts (default) or True to produce deep copies. 
+ + """ + for document in texts: + for doc_window in strided_windows(document, window_size): + yield doc_window.copy() if copy else doc_window From 2e3852ef9974259fd28402591893498af0b8e7c0 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 15:51:58 -0400 Subject: [PATCH 05/33] handle edge cases with window_size equal to or exceeding document size in strided_windows and iter_windows utiltity functions --- gensim/test/test_text_analysis.py | 0 gensim/test/test_utils.py | 22 ++++++++++++++++++++++ gensim/topic_coherence/text_analysis.py | 0 gensim/utils.py | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 gensim/test/test_text_analysis.py create mode 100644 gensim/topic_coherence/text_analysis.py diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index cbdac0170b..612d55dd68 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -128,6 +128,28 @@ def test_strided_windows2(self): out[0, 0] = 10 self.assertEqual(10, input_arr[0], "should make view rather than copy") + def test_strided_windows_window_size_exceeds_size(self): + input_arr = np.array(['this', 'is', 'test'], dtype='object') + out = utils.strided_windows(input_arr, 4) + expected = np.ndarray((0, 0)) + self._assert_arrays_equal(expected, out) + + def test_strided_windows_window_size_equals_size(self): + input_arr = np.array(['this', 'is', 'test'], dtype='object') + out = utils.strided_windows(input_arr, 3) + expected = np.array([input_arr.copy()]) + self._assert_arrays_equal(expected, out) + + def test_iter_windows_include_below_window_size(self): + texts = [['this', 'is', 'a'], ['test', 'document']] + out = utils.iter_windows(texts, 3, ignore_below_size=False) + windows = [list(w) for w in out] + self.assertEqual(texts, windows) + + out = utils.iter_windows(texts, 3) + windows = [list(w) for w in out] + self.assertEqual([texts[0]], windows) + def test_iter_windows_list_texts(self): texts = [['this', 'is', 'a'], ['test', 'document']] windows = list(utils.iter_windows(texts, 2)) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/gensim/utils.py b/gensim/utils.py index 36d70b1927..3a191f1a9a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1217,13 +1217,18 @@ def strided_windows(ndarray, window_size): copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) + if window_size == ndarray.shape[0]: + return np.array([ndarray]) + elif window_size > ndarray.shape[0]: + return np.ndarray((0, 0)) + stride = ndarray.strides[0] return np.lib.stride_tricks.as_strided( ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size), strides=(stride, stride)) -def iter_windows(texts, window_size, copy=False): +def iter_windows(texts, window_size, copy=False, ignore_below_size=True): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. @@ -1233,8 +1238,15 @@ def iter_windows(texts, window_size, copy=False): texts: List of string sentences. window_size: Size of sliding window. copy: False to use views of the texts (default) or True to produce deep copies. 
+ ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). + If False, the documents below `window_size` will be yielded as the full document. """ for document in texts: - for doc_window in strided_windows(document, window_size): - yield doc_window.copy() if copy else doc_window + doc_windows = strided_windows(document, window_size) + if doc_windows.shape[0] == 0: + if not ignore_below_size: + yield document.copy() if copy else document + else: + for doc_window in doc_windows: + yield doc_window.copy() if copy else doc_window From ec7af1bd89d99210599b183d0a3b018dd7cde7c9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 16:00:07 -0400 Subject: [PATCH 06/33] move code for building inverted index into a new text_analysis module and add initial impl of accumulator that directly tracks term occurrence and co-occurrence counts --- gensim/test/test_text_analysis.py | 84 +++++++++++ .../topic_coherence/probability_estimation.py | 74 +--------- gensim/topic_coherence/text_analysis.py | 132 ++++++++++++++++++ 3 files changed, 222 insertions(+), 68 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index e69de29bb2..d7b4695ac6 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -0,0 +1,84 @@ +import logging +import unittest + +from gensim.topic_coherence.text_analysis import \ + InvertedIndexAccumulator, WordOccurrenceAccumulator + + +class BaseTestCases(object): + + class TextAnalyzerTestBase(unittest.TestCase): + texts = [ + ['this', 'is', 'a'], + ['test', 'document'], + ['this', 'test', 'document'] + ] + token2id = { + 'this': 10, + 'is': 15, + 'a': 20, + 'test': 21, + 'document': 17 + } + top_words = token2id.keys() + + accumulator_cls = None + + def test_occurrence_counting(self): + accumulator = self.accumulator_cls(self.top_words, self.token2id) \ + .accumulate(self.texts, 3) + self.assertEqual(2, accumulator.get_occurrences("this")) + self.assertEqual(1, accumulator.get_occurrences("is")) + self.assertEqual(1, accumulator.get_occurrences("a")) + + self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) + self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) + + def test_occurences_for_irrelevant_words(self): + accumulator = WordOccurrenceAccumulator(self.top_words, self.token2id) \ + .accumulate(self.texts, 2) + with self.assertRaises(KeyError): + accumulator.get_occurrences("irrelevant") + with self.assertRaises(KeyError): + accumulator.get_co_occurrences("test", "irrelevant") + + +class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = InvertedIndexAccumulator + + def test_accumulate1(self): + accumulator = InvertedIndexAccumulator(self.top_words, self.token2id)\ + .accumulate(self.texts, 2) + # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 3}, + 15: {0, 1}, + 20: {1}, + 21: {2, 3, 4}, + 17: {2, 4} + } + self.assertDictEqual(expected, inverted_index) + + def test_accumulate2(self): + accumulator = InvertedIndexAccumulator(self.top_words, self.token2id) \ + .accumulate(self.texts, 3) + # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 2}, + 15: {0}, + 20: {0}, + 21: {1, 2}, + 17: {1, 2} + } + self.assertDictEqual(expected, inverted_index) + + +class 
TestWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = WordOccurrenceAccumulator + + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index ff9c0708bc..c7f5ca4dca 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -9,11 +9,12 @@ """ import logging -from itertools import chain, islice -from collections import defaultdict +import itertools import numpy as np +from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator + logger = logging.getLogger(__name__) @@ -23,7 +24,7 @@ def _ret_top_ids(segmented_topics): """ top_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: - for word_id in chain.from_iterable(s_i): + for word_id in itertools.chain.from_iterable(s_i): if isinstance(word_id, np.ndarray): for i in word_id: top_ids.add(i) @@ -78,60 +79,6 @@ def p_boolean_document(corpus, segmented_topics): return per_topic_postings, len(corpus) -def _iter_windows(texts, window_size): - """Produce a generator over the given texts using a sliding window of `window_size`. - - Args: - ---- - texts: List of string sentences. - window_size: Size of sliding window. - - """ - for document in texts: - it = iter(document) - window = tuple(islice(it, window_size)) - yield window - - for elem in it: - window = window[1:] + (elem,) - yield window - - -class WordOccurrenceAccumulator(object): - """Accumulate word occurrences from a sequence of documents.""" - - def __init__(self, relevant_words): - """ - Args: - ---- - relevant_words: the set of words that occurrences should be accumulated for. - """ - self.relevant_words = set(relevant_words) - self.window_id = 0 # id of next document to be observed - self.word_occurrences = defaultdict(set) # map from words to ids of docs they occur in - - def filter_to_relevant_words(self, doc): - return (word for word in doc if word in self.relevant_words) - - def add_occurrences_from_doc(self, window): - for word in self.filter_to_relevant_words(window): - self.word_occurrences[word].add(self.window_id) - - self.window_id += 1 - - def text_is_relevant(self, text): - for word in text: - if word in self.relevant_words: - return True - return False - - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in _iter_windows(relevant_texts, window_size): - self.add_occurrences_from_doc(virtual_document) - return self - - def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ This function performs the boolean sliding window probability estimation. Boolean sliding window @@ -153,17 +100,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ top_ids = _ret_top_ids(segmented_topics) top_words = _ids_to_words(top_ids, dictionary) - occurrence_accumulator = WordOccurrenceAccumulator(top_words)\ + occurrence_accumulator = InvertedIndexAccumulator(top_words, dictionary.token2id)\ .accumulate(texts, window_size) - # Replace words with their ids. - occurrences = occurrence_accumulator.word_occurrences - per_topic_postings = {dictionary.token2id[word]: id_set - for word, id_set in occurrences.iteritems()} - - # Ensure all top ids have a corresponding set, even if it's an empty one. 
- for word_id in top_ids: - if word_id not in per_topic_postings: - per_topic_postings[word_id] = set() - + per_topic_postings = occurrence_accumulator.index_to_dict() return per_topic_postings, occurrence_accumulator.window_id diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index e69de29bb2..03baec13d3 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains classes for analyzing the texts of a corpus to accumulate +statistical information about word occurrences. +""" + +import itertools + +import numpy as np +import scipy.sparse as sps + +from gensim import utils + + +class TextsAnalyzer(object): + """Gather some statistics about relevant terms a corpus by iterating over texts.""" + + def __init__(self, relevant_words, token2id): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. + """ + self.relevant_words = set(relevant_words) + self.relevant_ids = set(token2id[word] for word in self.relevant_words) + self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self.token2id = token2id + + def filter_to_relevant_words(self, text): + """Lazily filter the text to only those words which are relevant.""" + relevant_words = (word for word in text if word in self.relevant_words) + relevant_ids = (self.token2id[word] for word in relevant_words) + return (self.id2contiguous[word_id] for word_id in relevant_ids) + + def text_is_relevant(self, text): + """Return True if the text has any relevant words, else False.""" + for word in text: + if word in self.relevant_words: + return True + return False + + def analyze_text(self, text): + raise NotImplementedError("Base classes should implement analyze_text.") + + def accumulate(self, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): + self.analyze_text(virtual_document) + return self + + def get_occurrences(self, word): + """Return number of docs the word occurs in, once `accumulate` has been called.""" + word_id = self.token2id[word] + return self._get_occurrences(self.id2contiguous[word_id]) + + def _get_occurrences(self, word_id): + raise NotImplementedError("Base classes should implement occurrences") + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + word_id1 = self.token2id[word1] + word_id2 = self.token2id[word2] + return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) + + def _get_co_occurrences(self, word_id1, word_id2): + raise NotImplementedError("Base classes should implement co_occurrences") + + +class InvertedIndexAccumulator(TextsAnalyzer): + """Build an inverted index from a sequence of corpus texts.""" + + def __init__(self, *args): + super(InvertedIndexAccumulator, self).__init__(*args) + self.window_id = 0 # id of next document to be observed + vocab_size = len(self.relevant_words) + self._inverted_index = np.array([set() for _ in range(vocab_size)]) + + def analyze_text(self, window): + for word_id in self.filter_to_relevant_words(window): + self._inverted_index[word_id].add(self.window_id) + + 
self.window_id += 1 + + def index_to_dict(self): + contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + + def _get_occurrences(self, word_id): + return len(self._inverted_index[word_id]) + + def _get_co_occurrences(self, word_id1, word_id2): + s1 = self._inverted_index[word_id1] + s2 = self._inverted_index[word_id2] + return len(s1.intersection(s2)) + + +class WordOccurrenceAccumulator(TextsAnalyzer): + """Accumulate word occurrences and co-occurrences from a corpus of texts.""" + + def __init__(self, *args): + super(WordOccurrenceAccumulator, self).__init__(*args) + vocab_size = len(self.relevant_words) + self._occurrences = np.zeros(vocab_size, dtype='uint32') + self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + + def analyze_text(self, window): + relevant_words = list(self.filter_to_relevant_words(window)) + uniq_words = np.array(relevant_words) + self._occurrences[uniq_words] += 1 + + for combo in itertools.combinations(relevant_words, 2): + self._co_occurrences[combo] += 1 + + def _symmetrize(self): + co_occ = self._co_occurrences + return co_occ + co_occ.T - np.diag(co_occ.diagonal()) + + def accumulate(self, texts, window_size): + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + self._symmetrize() + return self + + def _get_occurrences(self, word_id): + return self._occurrences[word_id] + + def _get_co_occurrences(self, word_id1, word_id2): + return self._co_occurrences[word_id1, word_id2] From 3f8fb7f52c788d135fbd4da809c97677c85bceb9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 18:40:04 -0400 Subject: [PATCH 07/33] complete migration to using the accumulators in the new text_analysis package for all confirmation measures in the CoherenceModel pipeline --- gensim/models/coherencemodel.py | 19 +- gensim/test/test_coherencemodel.py | 7 +- gensim/test/test_direct_confirmation.py | 19 +- gensim/test/test_text_analysis.py | 13 +- .../direct_confirmation_measure.py | 41 ++-- .../indirect_confirmation_measure.py | 17 +- .../topic_coherence/probability_estimation.py | 37 +--- gensim/topic_coherence/text_analysis.py | 176 +++++++++++++----- 8 files changed, 202 insertions(+), 127 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 4e110681e2..9888dcb3d6 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -63,6 +63,7 @@ 'c_npmi': 10 } + class CoherenceModel(interfaces.TransformationABC): """ Objects of this class allow for building and maintaining a model for topic @@ -143,6 +144,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.dictionary = dictionary # Check for correct inputs for u_mass coherence measure. + self.coherence = coherence if coherence in boolean_document_based: if is_corpus(corpus)[0]: self.corpus = corpus @@ -155,6 +157,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: self.window_size = window_size + if self.window_size is None: + self.window_size = sliding_windows_dict[self.coherence] if texts is None: raise ValueError("'texts' should be provided for %s coherence." 
% coherence) else: @@ -173,7 +177,6 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= for n, _ in enumerate(topic): t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) - self.coherence = coherence def __str__(self): return coherence_dict[self.coherence].__str__() @@ -203,18 +206,16 @@ def get_coherence_per_topic(self): segmented_topics = measure.seg(self.topics) if self.coherence in boolean_document_based: - per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, per_topic_postings, num_docs) + accumulator = measure.prob(self.corpus, segmented_topics) + return measure.conf(segmented_topics, accumulator) - if self.window_size is not None: - self.window_size = sliding_windows_dict[self.coherence] - per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + return measure.conf(self.topics, segmented_topics, accumulator, 'nlr', 1) else: normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + return measure.conf(segmented_topics, accumulator, normalize=normalize) def aggregate_measures(self, confirmed_measures): measure = coherence_dict[self.coherence] diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 3961f67180..d69aaf0dad 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -14,7 +14,7 @@ import os.path import tempfile -from gensim.models.coherencemodel import CoherenceModel +from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit @@ -35,14 +35,13 @@ ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] -boolean_document_based = ['u_mass'] -sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') + def checkCoherenceMeasure(topics1, topics2, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: @@ -53,6 +52,7 @@ def checkCoherenceMeasure(topics1, topics2, coherence): cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence) return cm1.get_coherence() > cm2.get_coherence() + class TestCoherenceModel(unittest.TestCase): def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. 
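
A rough usage sketch of the refactored CoherenceModel interface exercised by these tests (it reuses the toy corpus and topics defined here; the topic word lists are illustrative):

    from gensim.corpora.dictionary import Dictionary
    from gensim.models.coherencemodel import CoherenceModel

    texts = [['human', 'interface', 'computer'],
             ['eps', 'user', 'interface', 'system'],
             ['system', 'human', 'system', 'eps'],
             ['user', 'response', 'time'],
             ['trees'],
             ['graph', 'trees']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    topics = [['human', 'computer', 'system', 'interface'],
              ['graph', 'trees', 'eps', 'user']]

    cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    per_topic = cm.get_coherence_per_topic()      # confirmation values for the segmented topic pairs
    coherence = cm.aggregate_measures(per_topic)  # same value that cm.get_coherence() returns
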
@@ -219,6 +219,7 @@ def testPersistenceCompressed(self): model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index cb35f0acc4..ad39b99b62 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -10,38 +10,49 @@ import logging import unittest +from collections import namedtuple from gensim.topic_coherence import direct_confirmation_measure +from gensim.topic_coherence import text_analysis + class TestDirectConfirmationMeasure(unittest.TestCase): def setUp(self): # Set up toy example for better understanding and testing # of this module. See the modules for the mathematical formulas self.segmentation = [[(1, 2)]] - self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.posting_list = {1: {2, 3, 4}, 2: {3, 5}} self.num_docs = 5 + id2token = {1: 'test', 2: 'doc'} + token2id = {v: k for k, v in id2token.items()} + dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + self.accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + self.accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + self.accumulator._num_docs = self.num_docs + def testLogConditionalProbability(self): """Test log_conditional_probability()""" - obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.posting_list, self.num_docs)[0] + obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.accumulator)[0] # Answer should be ~ ln(1 / 2) = -0.693147181 expected = -0.693147181 self.assertAlmostEqual(obtained, expected) def testLogRatioMeasure(self): """Test log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0] + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator)[0] # Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 expected = -0.182321557 self.assertAlmostEqual(obtained, expected) def testNormalizedLogRatioMeasure(self): """Test normalized_log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0] + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator, normalize=True)[0] # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753 expected = -0.113282753 self.assertAlmostEqual(obtained, expected) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index d7b4695ac6..27eecbc645 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,5 +1,6 @@ import logging import unittest +from collections import namedtuple from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator @@ -20,12 +21,14 @@ class TextAnalyzerTestBase(unittest.TestCase): 'test': 21, 'document': 17 } - top_words = token2id.keys() + id2token = {v: k for k, v in token2id.items()} + dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + top_ids = set(token2id.values()) accumulator_cls = None def test_occurrence_counting(self): - 
accumulator = self.accumulator_cls(self.top_words, self.token2id) \ + accumulator = self.accumulator_cls(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) self.assertEqual(2, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) @@ -35,7 +38,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurences_for_irrelevant_words(self): - accumulator = WordOccurrenceAccumulator(self.top_words, self.token2id) \ + accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") @@ -47,7 +50,7 @@ class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): accumulator_cls = InvertedIndexAccumulator def test_accumulate1(self): - accumulator = InvertedIndexAccumulator(self.top_words, self.token2id)\ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 2) # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] inverted_index = accumulator.index_to_dict() @@ -61,7 +64,7 @@ def test_accumulate1(self): self.assertDictEqual(expected, inverted_index) def test_accumulate2(self): - accumulator = InvertedIndexAccumulator(self.top_words, self.token2id) \ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] inverted_index = accumulator.index_to_dict() diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index f50fb612e2..60631375ef 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -15,7 +15,8 @@ EPSILON = 1e-12 # Should be small. Value as suggested in paper. -def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): + +def log_conditional_probability(segmented_topics, accumulator): """ This function calculates the log-conditional-probability measure which is used by coherence measures such as U_mass. @@ -24,28 +25,29 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): Args: ---- segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. - num_docs : Total number of documents in corresponding corpus. + accumulator: word occurrence accumulator from probability_estimation. Returns: ------- m_lc : List of log conditional probability measure on each set in segmented topics. 
""" m_lc = [] + num_docs = float(accumulator.num_docs) for s_i in segmented_topics: for w_prime, w_star in s_i: - w_prime_docs = per_topic_postings[w_prime] - w_star_docs = per_topic_postings[w_star] - co_docs = w_prime_docs.intersection(w_star_docs) - if w_star_docs: - m_lc_i = np.log(((len(co_docs) / float(num_docs)) + EPSILON) / (len(w_star_docs) / float(num_docs))) - else: + try: + w_star_count = accumulator[w_star] + co_occur_count = accumulator[w_prime, w_star] + m_lc_i = np.log(((co_occur_count / num_docs) + EPSILON) / (w_star_count / num_docs)) + except KeyError: m_lc_i = 0.0 + m_lc.append(m_lc_i) return m_lc -def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False): + +def log_ratio_measure(segmented_topics, accumulator, normalize=False): """ If normalize=False: Popularly known as PMI. @@ -61,28 +63,29 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize= Args: ---- segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics - num_docs : Total number of documents in corpus. Used for calculating probability. + accumulator: word occurrence accumulator from probability_estimation. Returns: ------- m_lr : List of log ratio measures on each set in segmented topics. """ m_lr = [] + num_docs = float(accumulator.num_docs) for s_i in segmented_topics: for w_prime, w_star in s_i: - w_prime_docs = per_topic_postings[w_prime] - w_star_docs = per_topic_postings[w_star] - co_docs = w_prime_docs.intersection(w_star_docs) + w_prime_count = accumulator[w_prime] + w_star_count = accumulator[w_star] + co_occur_count = accumulator[w_prime, w_star] + if normalize: # For normalized log ratio measure - numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] - co_doc_prob = len(co_docs) / float(num_docs) + numerator = log_ratio_measure([[(w_prime, w_star)]], accumulator)[0] + co_doc_prob = co_occur_count / num_docs m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON)) else: # For log ratio measure without normalization - numerator = (len(co_docs) / float(num_docs)) + EPSILON - denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) + numerator = (co_occur_count / num_docs) + EPSILON + denominator = (w_prime_count / num_docs) * (w_star_count / num_docs) m_lr_i = np.log(numerator / denominator) m_lr.append(m_lr_i) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index c68206a372..c4585ad677 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -48,7 +48,8 @@ def _present(w_prime_star, w, w_backtrack): return -1 return index -def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): + +def _make_seg(w_prime, w, accumulator, measure, gamma, backtrack): """ Internal helper function to return context vectors for segmentations. 
""" @@ -57,7 +58,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc for w_j in w: for w_i in w_prime: if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], accumulator, measure[1])[0] if w_j not in context_vectors: context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma else: @@ -65,11 +66,13 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc else: for w_j in w: if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], accumulator, measure[1])[0] context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma - return (context_vectors, backtrack) -def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs): + return context_vectors, backtrack + + +def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): """ This function calculates the indirect cosine measure. Given context vectors _ _ _ _ @@ -116,7 +119,7 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam if w_backtrack and w_prime_index != -1: w_prime_context_vectors = context_vector_backtrack[w_prime_index] else: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, accumulator, measure, gamma, backtrack) backtrack.update(backtrack_i) # Update backtracking lists w_backtrack.append((w_prime, top_words)) @@ -128,7 +131,7 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam if w_backtrack and w_star_index != -1: w_star_context_vectors = context_vector_backtrack[w_star_index] else: - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, accumulator, measure, gamma, backtrack) backtrack.update(backtrack_i) # Update all backtracking lists w_backtrack.append((w_star, top_words)) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index c7f5ca4dca..d9982ca409 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,7 @@ import numpy as np -from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator +from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator, CorpusAnalyzer logger = logging.getLogger(__name__) @@ -34,21 +34,6 @@ def _ret_top_ids(segmented_topics): return top_ids -def _ids_to_words(ids, dictionary): - """Convert an iterable of ids to their corresponding words using a dictionary. - This function abstracts away the differences between the HashDictionary and the standard one. - """ - top_words = set() - for word_id in ids: - word = dictionary[word_id] - if isinstance(word, set): - top_words = top_words.union(word) - else: - top_words.add(word) - - return top_words - - def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -65,18 +50,8 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. 
""" top_ids = _ret_top_ids(segmented_topics) - # Instantiate the dictionary with empty sets for each top_id - per_topic_postings = {word_id: set() for word_id in top_ids} - - # Iterate through the documents, appending the document number to the set for each top_id it contains - for n, document in enumerate(corpus): - doc_words = frozenset(x[0] for x in document) - top_ids_in_doc = top_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for word_id in top_ids_in_doc: - per_topic_postings[word_id].add(n) - - return per_topic_postings, len(corpus) + accumulator = CorpusAnalyzer(top_ids).accumulate(corpus) + return accumulator def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): @@ -99,9 +74,5 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - top_words = _ids_to_words(top_ids, dictionary) - occurrence_accumulator = InvertedIndexAccumulator(top_words, dictionary.token2id)\ + return InvertedIndexAccumulator(top_ids, dictionary)\ .accumulate(texts, window_size) - - per_topic_postings = occurrence_accumulator.index_to_dict() - return per_topic_postings, occurrence_accumulator.window_id diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 03baec13d3..f175cbe21a 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,78 +17,99 @@ from gensim import utils -class TextsAnalyzer(object): - """Gather some statistics about relevant terms a corpus by iterating over texts.""" +def _ids_to_words(ids, dictionary): + """Convert an iterable of ids to their corresponding words using a dictionary. + This function abstracts away the differences between the HashDictionary and the standard one. + """ + if not dictionary.id2token: + setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) - def __init__(self, relevant_words, token2id): - """ - Args: - ---- - relevant_words: the set of words that occurrences should be accumulated for. 
- """ - self.relevant_words = set(relevant_words) - self.relevant_ids = set(token2id[word] for word in self.relevant_words) - self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} - self.token2id = token2id + top_words = set() + for word_id in ids: + word = dictionary.id2token[word_id] + if isinstance(word, set): + top_words = top_words.union(word) + else: + top_words.add(word) - def filter_to_relevant_words(self, text): - """Lazily filter the text to only those words which are relevant.""" - relevant_words = (word for word in text if word in self.relevant_words) - relevant_ids = (self.token2id[word] for word in relevant_words) - return (self.id2contiguous[word_id] for word_id in relevant_ids) + return top_words - def text_is_relevant(self, text): - """Return True if the text has any relevant words, else False.""" - for word in text: - if word in self.relevant_words: - return True - return False + +class BaseAnalyzer(object): + """Base class for corpus and text analyzers.""" + + def __init__(self, relevant_ids): + self.relevant_ids = relevant_ids + self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self._num_docs = 0 + + @property + def num_docs(self): + return self._num_docs def analyze_text(self, text): raise NotImplementedError("Base classes should implement analyze_text.") - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): - self.analyze_text(virtual_document) - return self + def __getitem__(self, word_or_words): + if hasattr(word_or_words, '__iter__'): + return self.get_co_occurrences(*word_or_words) + else: + return self.get_occurrences(word_or_words) - def get_occurrences(self, word): + def get_occurrences(self, word_id): """Return number of docs the word occurs in, once `accumulate` has been called.""" - word_id = self.token2id[word] return self._get_occurrences(self.id2contiguous[word_id]) def _get_occurrences(self, word_id): raise NotImplementedError("Base classes should implement occurrences") - def get_co_occurrences(self, word1, word2): + def get_co_occurrences(self, word_id1, word_id2): """Return number of docs the words co-occur in, once `accumulate` has been called.""" - word_id1 = self.token2id[word1] - word_id2 = self.token2id[word2] return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) def _get_co_occurrences(self, word_id1, word_id2): raise NotImplementedError("Base classes should implement co_occurrences") -class InvertedIndexAccumulator(TextsAnalyzer): - """Build an inverted index from a sequence of corpus texts.""" +class UsesDictionary(BaseAnalyzer): + """Base class for corpus and text analyzers.""" - def __init__(self, *args): - super(InvertedIndexAccumulator, self).__init__(*args) - self.window_id = 0 # id of next document to be observed - vocab_size = len(self.relevant_words) - self._inverted_index = np.array([set() for _ in range(vocab_size)]) + def __init__(self, relevant_ids, dictionary): + super(UsesDictionary, self).__init__(relevant_ids) + self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) + self.token2id = dictionary.token2id - def analyze_text(self, window): - for word_id in self.filter_to_relevant_words(window): - self._inverted_index[word_id].add(self.window_id) + def analyze_text(self, text): + raise NotImplementedError("Base classes should implement analyze_text.") - 
self.window_id += 1 + def get_occurrences(self, word): + """Return number of docs the word occurs in, once `accumulate` has been called.""" + try: + word_id = self.token2id[word] + except KeyError: + word_id = word + return self._get_occurrences(self.id2contiguous[word_id]) + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + try: + word_id1 = self.token2id[word1] + except KeyError: + word_id1 = word1 + try: + word_id2 = self.token2id[word2] + except KeyError: + word_id2 = word2 + return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) - def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} - return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + +class InvertedIndexBased(BaseAnalyzer): + """Analyzer that builds up an inverted index to accumulate stats.""" + + def __init__(self, *args): + super(InvertedIndexBased, self).__init__(*args) + vocab_size = len(self.relevant_ids) + self._inverted_index = np.array([set() for _ in range(vocab_size)]) def _get_occurrences(self, word_id): return len(self._inverted_index[word_id]) @@ -98,6 +119,67 @@ def _get_co_occurrences(self, word_id1, word_id2): s2 = self._inverted_index[word_id2] return len(s1.intersection(s2)) + def index_to_dict(self): + contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + + +class CorpusAnalyzer(InvertedIndexBased): + """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" + + def analyze_text(self, text): + doc_words = frozenset(x[0] for x in text) + top_ids_in_doc = self.relevant_ids.intersection(doc_words) + if len(top_ids_in_doc) > 0: + for word_id in top_ids_in_doc: + self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) + + def accumulate(self, corpus): + for document in corpus: + self.analyze_text(document) + self._num_docs += 1 + return self + + +class TextsAnalyzer(UsesDictionary): + """Gather some statistics about relevant terms a corpus by iterating over texts.""" + + def __init__(self, relevant_ids, dictionary): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. 
+ """ + super(TextsAnalyzer, self).__init__(relevant_ids, dictionary) + + def filter_to_relevant_words(self, text): + """Lazily filter the text to only those words which are relevant.""" + relevant_words = (word for word in text if word in self.relevant_words) + relevant_ids = (self.token2id[word] for word in relevant_words) + return (self.id2contiguous[word_id] for word_id in relevant_ids) + + def text_is_relevant(self, text): + """Return True if the text has any relevant words, else False.""" + for word in text: + if word in self.relevant_words: + return True + return False + + def accumulate(self, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): + self.analyze_text(virtual_document) + self._num_docs += 1 + return self + + +class InvertedIndexAccumulator(TextsAnalyzer, InvertedIndexBased): + """Build an inverted index from a sequence of corpus texts.""" + + def analyze_text(self, window): + for word_id in self.filter_to_relevant_words(window): + self._inverted_index[word_id].add(self._num_docs) + class WordOccurrenceAccumulator(TextsAnalyzer): """Accumulate word occurrences and co-occurrences from a corpus of texts.""" From b12edefb26fd02cefdfd0863ea53c7104cd510f8 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 25 May 2017 11:02:13 -0400 Subject: [PATCH 08/33] fix bug in WordOccurrenceAccumulator so that co-occurrences of same word are interpreted as the occurrence; update tests to cover this case; change the p_boolean_sliding_window to use the WordOccurrenceAccumulator; minor cleanup in test_coherencemodel --- gensim/models/coherencemodel.py | 14 +++--- gensim/test/test_coherencemodel.py | 31 ++++++------ gensim/test/test_text_analysis.py | 43 +++++++++++++++++ .../topic_coherence/probability_estimation.py | 6 +-- gensim/topic_coherence/text_analysis.py | 48 ++++++++++++------- 5 files changed, 101 insertions(+), 41 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 9888dcb3d6..80e3b380d9 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -178,6 +178,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) + self._accumulator = None + def __str__(self): return coherence_dict[self.coherence].__str__() @@ -206,16 +208,16 @@ def get_coherence_per_topic(self): segmented_topics = measure.seg(self.topics) if self.coherence in boolean_document_based: - accumulator = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, accumulator) + self._accumulator = measure.prob(self.corpus, segmented_topics) + return measure.conf(segmented_topics, self._accumulator) - accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, accumulator, 'nlr', 1) + return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) else: normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, accumulator, normalize=normalize) + return measure.conf(segmented_topics, self._accumulator, normalize=normalize) def 
aggregate_measures(self, confirmed_measures): measure = coherence_dict[self.coherence] diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index d69aaf0dad..679f115f5b 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -42,17 +42,6 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -def checkCoherenceMeasure(topics1, topics2, coherence): - """Check provided topic coherence algorithm on given topics""" - if coherence in boolean_document_based: - cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence) - cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence) - else: - cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence) - cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence) - return cm1.get_coherence() > cm2.get_coherence() - - class TestCoherenceModel(unittest.TestCase): def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. @@ -77,21 +66,33 @@ def setUp(self): self.vw_path = vw_path self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) + def check_coherence_measure(self, coherence): + """Check provided topic coherence algorithm on given topics""" + if coherence in boolean_document_based: + kwargs = dict(corpus=corpus, dictionary=dictionary, coherence=coherence) + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) + else: + kwargs = dict(texts=texts, dictionary=dictionary, coherence=coherence) + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) + self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) + def testUMass(self): """Test U_Mass topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass')) + self.check_coherence_measure('u_mass') def testCv(self): """Test C_v topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v')) + self.check_coherence_measure('c_v') def testCuci(self): """Test C_uci topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci')) + self.check_coherence_measure('c_uci') def testCnpmi(self): """Test C_npmi topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi')) + self.check_coherence_measure('c_npmi') def testUMassLdaModel(self): """Perform sanity check to see if u_mass coherence works with LDA Model""" diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 27eecbc645..33a269f9d2 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -4,6 +4,7 @@ from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator +from gensim.corpora.dictionary import Dictionary class BaseTestCases(object): @@ -25,6 +26,20 @@ class TextAnalyzerTestBase(unittest.TestCase): dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) top_ids = set(token2id.values()) + texts2 = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 
'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['user', 'user']] + dictionary2 = Dictionary(texts2) + dictionary2.id2token = {v: k for k, v in dictionary2.token2id.items()} + top_ids2 = set(dictionary2.token2id.values()) + accumulator_cls = None def test_occurrence_counting(self): @@ -37,6 +52,34 @@ def test_occurrence_counting(self): self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) + def test_occurrence_counting2(self): + accumulator = self.accumulator_cls(self.top_ids2, self.dictionary2) \ + .accumulate(self.texts2, 110) + self.assertEqual(2, accumulator.get_occurrences("human")) + self.assertEqual(4, accumulator.get_occurrences("user")) + self.assertEqual(3, accumulator.get_occurrences("graph")) + self.assertEqual(3, accumulator.get_occurrences("trees")) + + cases = [ + (1, ("human", "interface")), + (2, ("system", "user")), + (2, ("graph", "minors")), + (2, ("graph", "trees")), + (4, ("user", "user")), + (3, ("graph", "graph")), + (0, ("time", "eps")) + ] + for expected_count, (word1, word2) in cases: + # Verify co-occurrence counts are correct, regardless of word order. + self.assertEqual(expected_count, accumulator.get_co_occurrences(word1, word2)) + self.assertEqual(expected_count, accumulator.get_co_occurrences(word2, word1)) + + # Also verify that using token ids instead of tokens works the same. + word_id1 = self.dictionary2.token2id[word1] + word_id2 = self.dictionary2.token2id[word2] + self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id1, word_id2)) + self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) + def test_occurences_for_irrelevant_words(self): accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 2) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index d9982ca409..f406e5a3e7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,7 @@ import numpy as np -from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator, CorpusAnalyzer +from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. """ top_ids = _ret_top_ids(segmented_topics) - accumulator = CorpusAnalyzer(top_ids).accumulate(corpus) + accumulator = CorpusAccumulator(top_ids).accumulate(corpus) return accumulator @@ -74,5 +74,5 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - return InvertedIndexAccumulator(top_ids, dictionary)\ + return WordOccurrenceAccumulator(top_ids, dictionary)\ .accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index f175cbe21a..a7ab9b815b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -20,8 +20,13 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. This function abstracts away the differences between the HashDictionary and the standard one. 
+ + Args: + ---- + ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). + This is the format returned by the topic_coherence.segmentation functions. """ - if not dictionary.id2token: + if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) top_words = set() @@ -72,7 +77,9 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): - """Base class for corpus and text analyzers.""" + """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. + The standard BaseAnalyzer can only deal with token ids since it does not have access to the token2id mapping. + """ def __init__(self, relevant_ids, dictionary): super(UsesDictionary, self).__init__(relevant_ids) @@ -90,17 +97,18 @@ def get_occurrences(self, word): word_id = word return self._get_occurrences(self.id2contiguous[word_id]) - def get_co_occurrences(self, word1, word2): - """Return number of docs the words co-occur in, once `accumulate` has been called.""" + def _word2_contiguous_id(self, word): try: - word_id1 = self.token2id[word1] - except KeyError: - word_id1 = word1 - try: - word_id2 = self.token2id[word2] + word_id = self.token2id[word] except KeyError: - word_id2 = word2 - return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) + word_id = word + return self.id2contiguous[word_id] + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + word_id1 = self._word2_contiguous_id(word1) + word_id2 = self._word2_contiguous_id(word2) + return self._get_co_occurrences(word_id1, word_id2) class InvertedIndexBased(BaseAnalyzer): @@ -124,7 +132,7 @@ def index_to_dict(self): return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} -class CorpusAnalyzer(InvertedIndexBased): +class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text): @@ -192,15 +200,21 @@ def __init__(self, *args): def analyze_text(self, window): relevant_words = list(self.filter_to_relevant_words(window)) - uniq_words = np.array(relevant_words) - self._occurrences[uniq_words] += 1 + if relevant_words: + uniq_words = np.array(relevant_words) + self._occurrences[uniq_words] += 1 - for combo in itertools.combinations(relevant_words, 2): - self._co_occurrences[combo] += 1 + for combo in itertools.combinations(relevant_words, 2): + self._co_occurrences[combo] += 1 def _symmetrize(self): + """Word pairs may have been encountered in (i, j) and (j, i) order. + Rather than enforcing a particular ordering during the update process, + we choose to symmetrize the co-occurrence matrix after accumulation has completed. 
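As a minimal, self-contained illustration of the same identity (the counts are made up; the sketch only needs scipy.sparse):

    import scipy.sparse as sps

    counts = sps.lil_matrix((3, 3), dtype='uint32')
    counts[0, 1] = 2           # pair observed twice in (0, 1) order
    counts[1, 0] = 1           # ... and once in (1, 0) order
    counts.setdiag([4, 3, 1])  # diagonal holds the plain occurrence counts
    sym = counts + counts.T - sps.diags(counts.diagonal(), dtype='uint32')
    # sym[0, 1] == sym[1, 0] == 3, while the diagonal stays (4, 3, 1)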
+ """ co_occ = self._co_occurrences - return co_occ + co_occ.T - np.diag(co_occ.diagonal()) + co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts + self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') def accumulate(self, texts, window_size): super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) From 91b8a05e10686585e7c29af9b09af3572e00d469 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 25 May 2017 14:39:03 -0400 Subject: [PATCH 09/33] make wikicorpus parsing handle KeyboardInterrupt gracefully --- gensim/corpora/wikicorpus.py | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index fb402da517..2d9b598a71 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -23,6 +23,7 @@ import re from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution import multiprocessing +import signal from gensim import utils @@ -249,6 +250,10 @@ def process_article(args): return result, title, pageid +def init_worker(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + class WikiCorpus(TextCorpus): """ Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. @@ -300,22 +305,26 @@ def get_texts(self): articles, articles_all = 0, 0 positions, positions_all = 0, 0 texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) - pool = multiprocessing.Pool(self.processes) + pool = multiprocessing.Pool(self.processes, init_worker) # process the corpus in smaller chunks of docs, because multiprocessing.Pool # is dumb and would load the entire input into RAM at once... 
- for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): - for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): - articles_all += 1 - positions_all += len(tokens) - # article redirects and short stubs are pruned here - if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): - continue - articles += 1 - positions += len(tokens) - if self.metadata: - yield (tokens, (pageid, title)) - else: - yield tokens + try: + for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): + for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): + articles_all += 1 + positions_all += len(tokens) + # article redirects and short stubs are pruned here + if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + continue + articles += 1 + positions += len(tokens) + if self.metadata: + yield (tokens, (pageid, title)) + else: + yield tokens + except KeyboardInterrupt: + pass + pool.terminate() logger.info( From c6224b7d6d02b069dd9f2731ee2de01e50719257 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 26 May 2017 19:23:04 -0400 Subject: [PATCH 10/33] add ParallelWordOccurrenceAccumulator and make default method for p_boolean_sliding_window; add parameter for CoherenceModel to adjust number of processes used, with default equal to max(1, cpu_count - 1) --- gensim/models/coherencemodel.py | 7 +- gensim/test/test_text_analysis.py | 30 +++- .../topic_coherence/probability_estimation.py | 16 +- gensim/topic_coherence/text_analysis.py | 158 +++++++++++++++++- gensim/utils.py | 10 +- 5 files changed, 200 insertions(+), 21 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 80e3b380d9..d0ff707457 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -20,6 +20,7 @@ import logging from collections import namedtuple +import multiprocessing as mp import numpy as np @@ -90,7 +91,7 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
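A hypothetical usage sketch (variable names are illustrative; my_topics is a list of lists of top tokens present in my_dictionary, and leaving `processes` at its default of -1 falls back to max(1, cpu_count - 1)):

    cm = CoherenceModel(topics=my_topics, texts=my_texts, dictionary=my_dictionary,
                        coherence='c_v', processes=2)
    score = cm.get_coherence()                # aggregated coherence over all topics
    per_topic = cm.get_coherence_per_topic()  # unaggregated confirmation values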
""" def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, - coherence='c_v', topn=10): + coherence='c_v', topn=10, processes=-1): """ Args: ---- @@ -178,6 +179,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) + self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) self._accumulator = None def __str__(self): @@ -212,7 +214,8 @@ def get_coherence_per_topic(self): return measure.conf(segmented_topics, self._accumulator) self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + dictionary=self.dictionary, window_size=self.window_size, + processes=self.processes) if self.coherence == 'c_v': return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) else: diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 33a269f9d2..8ee08a2373 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,9 +1,8 @@ import logging import unittest -from collections import namedtuple from gensim.topic_coherence.text_analysis import \ - InvertedIndexAccumulator, WordOccurrenceAccumulator + InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator from gensim.corpora.dictionary import Dictionary @@ -22,8 +21,9 @@ class TextAnalyzerTestBase(unittest.TestCase): 'test': 21, 'document': 17 } - id2token = {v: k for k, v in token2id.items()} - dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + dictionary = Dictionary(texts) + dictionary.token2id = token2id + dictionary.id2token = {v: k for k, v in token2id.items()} top_ids = set(token2id.values()) texts2 = [['human', 'interface', 'computer'], @@ -42,8 +42,14 @@ class TextAnalyzerTestBase(unittest.TestCase): accumulator_cls = None + def init_accumulator(self): + return self.accumulator_cls(self.top_ids, self.dictionary) + + def init_accumulator2(self): + return self.accumulator_cls(self.top_ids2, self.dictionary2) + def test_occurrence_counting(self): - accumulator = self.accumulator_cls(self.top_ids, self.dictionary) \ + accumulator = self.init_accumulator()\ .accumulate(self.texts, 3) self.assertEqual(2, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) @@ -53,7 +59,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): - accumulator = self.accumulator_cls(self.top_ids2, self.dictionary2) \ + accumulator = self.init_accumulator2()\ .accumulate(self.texts2, 110) self.assertEqual(2, accumulator.get_occurrences("human")) self.assertEqual(4, accumulator.get_occurrences("user")) @@ -81,7 +87,7 @@ def test_occurrence_counting2(self): self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) def test_occurences_for_irrelevant_words(self): - accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ + accumulator = self.init_accumulator() \ .accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") @@ -125,6 +131,16 @@ class TestWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): accumulator_cls = WordOccurrenceAccumulator +class TestParallelWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = 
ParallelWordOccurrenceAccumulator + + def init_accumulator(self): + return self.accumulator_cls(2, self.top_ids, self.dictionary) + + def init_accumulator2(self): + return self.accumulator_cls(2, self.top_ids2, self.dictionary2) + + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index f406e5a3e7..604fa07a24 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,8 @@ import numpy as np -from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator +from gensim.topic_coherence.text_analysis import \ + CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator logger = logging.getLogger(__name__) @@ -50,11 +51,10 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. """ top_ids = _ret_top_ids(segmented_topics) - accumulator = CorpusAccumulator(top_ids).accumulate(corpus) - return accumulator + return CorpusAccumulator(top_ids).accumulate(corpus) -def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): +def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): """ This function performs the boolean sliding window probability estimation. Boolean sliding window determines word counts using a sliding window. The window moves over the documents one word token per step. @@ -74,5 +74,9 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - return WordOccurrenceAccumulator(top_ids, dictionary)\ - .accumulate(texts, window_size) + if processes <= 1: + accumulator = WordOccurrenceAccumulator(top_ids, dictionary) + else: + accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) + logger.info("using %s to estimate probabilities from sliding windows" % accumulator) + return accumulator.accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a7ab9b815b..a9265347a3 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -9,13 +9,18 @@ statistical information about word occurrences. """ +import sys import itertools +import logging +import multiprocessing as mp import numpy as np import scipy.sparse as sps from gensim import utils +logger = logging.getLogger(__name__) + def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. 
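For reference, a hedged sketch of the two dictionary flavours this helper has to cope with (the toy texts are illustrative):

    from gensim.corpora.dictionary import Dictionary
    from gensim.corpora.hashdictionary import HashDictionary

    texts = [['human', 'interface'], ['graph', 'trees']]
    d1 = Dictionary(texts)      # id2token maps each id to a single token
    d2 = HashDictionary(texts)  # id2token maps each id to a set of tokens (several tokens can share one hashed id)

    _ids_to_words(set(d1.token2id.values()), d1)  # {'human', 'interface', 'graph', 'trees'}
    _ids_to_words(set(d2.token2id.values()), d2)  # same tokens, via the set-valued branch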
@@ -46,12 +51,20 @@ class BaseAnalyzer(object): def __init__(self, relevant_ids): self.relevant_ids = relevant_ids self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self.log_every = 1000 self._num_docs = 0 @property def num_docs(self): return self._num_docs + @num_docs.setter + def num_docs(self, num): + self._num_docs = num + if self._num_docs % self.log_every == 0: + logger.info("%s accumulated stats from %d documents" % ( + self.__class__.__name__, self._num_docs)) + def analyze_text(self, text): raise NotImplementedError("Base classes should implement analyze_text.") @@ -84,6 +97,7 @@ class UsesDictionary(BaseAnalyzer): def __init__(self, relevant_ids, dictionary): super(UsesDictionary, self).__init__(relevant_ids) self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) + self.dictionary = dictionary self.token2id = dictionary.token2id def analyze_text(self, text): @@ -177,7 +191,7 @@ def accumulate(self, texts, window_size): relevant_texts = (text for text in texts if self.text_is_relevant(text)) for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): self.analyze_text(virtual_document) - self._num_docs += 1 + self.num_docs += 1 return self @@ -198,6 +212,9 @@ def __init__(self, *args): self._occurrences = np.zeros(vocab_size, dtype='uint32') self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + def __str__(self): + return self.__class__.__name__ + def analyze_text(self, window): relevant_words = list(self.filter_to_relevant_words(window)) if relevant_words: @@ -217,6 +234,7 @@ def _symmetrize(self): self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') def accumulate(self, texts, window_size): + self._co_occurrences = self._co_occurrences.tolil() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) self._symmetrize() return self @@ -226,3 +244,141 @@ def _get_occurrences(self, word_id): def _get_co_occurrences(self, word_id1, word_id2): return self._co_occurrences[word_id1, word_id2] + + def merge(self, other): + self._occurrences += other._occurrences + self._co_occurrences += other._co_occurrences + self._num_docs += other._num_docs + + +class _WordOccurrenceAccumulator(WordOccurrenceAccumulator): + """Monkey patched to avoid symmetrizing co-occurrence matrix after each batch.""" + def accumulate(self, texts, window_size): + TextsAnalyzer.accumulate(self, texts, window_size) + return self + + +class ParallelWordOccurrenceAccumulator(TextsAnalyzer): + """Accumulate word occurrences in parallel.""" + + def __init__(self, processes, *args, **kwargs): + super(ParallelWordOccurrenceAccumulator, self).__init__(*args) + if processes < 2: + raise ValueError("Must have at least 2 processes to run in parallel; got %d" % processes) + self.processes = processes + self.batch_size = kwargs.get('batch_size', 16) + + def __str__(self): + return "%s(processes=%s, batch_size=%s)" % ( + self.__class__.__name__, self.processes, self.batch_size) + + def accumulate(self, texts, window_size): + workers, input_q, output_q = self.start_workers(window_size) + try: + self.queue_all_texts(input_q, texts, window_size) + interrupted = False + except KeyboardInterrupt: + logger.warn("stats accumulation interrupted; <= %d documents processed" % self._num_docs) + interrupted = True + + accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) + return self.merge_accumulators(accumulators) + + def start_workers(self, window_size): + 
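        # One bounded input queue feeds batches of texts to all workers; a shared output
        # queue carries each worker's partial accumulator back to the master once it is
        # told to stop (see terminate_workers below, which sends one sentinel per worker).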
input_q = mp.Queue(maxsize=self.processes) + output_q = mp.Queue() + workers = [] + for _ in range(self.processes): + accumulator = _WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) + worker.start() + workers.append(worker) + + return workers, input_q, output_q + + def yield_batches(self, texts): + batch = [] + for text in texts: + batch.append(text) + if len(batch) == self.batch_size: + yield batch + batch = [] + + if batch: + yield batch + + def queue_all_texts(self, q, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): + q.put(batch, block=True) + before = self._num_docs / self.log_every + self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) + if before < (self._num_docs / self.log_every): + logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( + batch_num, batch_num * self.batch_size, self._num_docs)) + + def terminate_workers(self, input_q, output_q, workers, interrupted=False): + if not interrupted: + for _ in workers: + input_q.put(None, block=True) + + accumulators = [] + while len(accumulators) != len(workers): + accumulators.append(output_q.get()) + logger.info("%d accumulators retrieved from output queue" % len(accumulators)) + + for worker in workers: + if worker.is_alive(): + worker.terminate() + + input_q.close() + output_q.close() + return accumulators + + def merge_accumulators(self, accumulators): + accumulator = accumulators[0] + for other_accumulator in accumulators[1:]: + accumulator.merge(other_accumulator) + accumulator._symmetrize() + return accumulator + + +class AccumulatingWorker(mp.Process): + """Accumulate stats from texts fed in from queue.""" + + def __init__(self, input_q, output_q, accumulator, window_size): + super(AccumulatingWorker, self).__init__() + self.input_q = input_q + self.output_q = output_q + self.accumulator = accumulator + self.accumulator.log_every = sys.maxint # avoid logging in workers + self.window_size = window_size + + def run(self): + try: + self._run() + except KeyboardInterrupt: + logger.info("%s interrupted after processing %d documents" % ( + self.__class__.__name__, self.accumulator.num_docs)) + finally: + self.reply_to_master() + + def _run(self): + batch_num = 0 + n_docs = 0 + while True: + docs = self.input_q.get(block=True) + if docs is None: # sentinel value + break + + self.accumulator.accumulate(docs, self.window_size) + n_docs += len(docs) + logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( + batch_num, n_docs, self.accumulator.num_docs)) + batch_num += 1 + + def reply_to_master(self): + logger.info("serializing accumulator to return to master...") + self.output_q.put(self.accumulator, block=False) + logger.info("accumulator serialized") + diff --git a/gensim/utils.py b/gensim/utils.py index 3a191f1a9a..7300b17abd 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1193,7 +1193,7 @@ def sample_dict(d, n=10, use_random=True): def strided_windows(ndarray, window_size): """ Produce a numpy.ndarray of windows, as from a sliding window. - + >>> strided_windows(np.arange(5), 2) array([[0, 1], [1, 2], @@ -1206,12 +1206,12 @@ def strided_windows(ndarray, window_size): [3, 4, 5, 6, 7], [4, 5, 6, 7, 8], [5, 6, 7, 8, 9]]) - + Args: ---- ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. 
- :param window_size: + :param window_size: :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over the `ndarray`. Since this uses striding, the individual arrays are views rather than copies of `ndarray`. Changes to one view modifies the others and the original. @@ -1232,7 +1232,7 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. - + Args: ---- texts: List of string sentences. @@ -1240,7 +1240,7 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): copy: False to use views of the texts (default) or True to produce deep copies. ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). If False, the documents below `window_size` will be yielded as the full document. - + """ for document in texts: doc_windows = strided_windows(document, window_size) From f00d389a4f24bc1433023663aa55ecdaeb432530 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 26 May 2017 21:57:59 -0400 Subject: [PATCH 11/33] clean up, clarify, and optimize the indirect_confirmation_measure.cosine_similarity function --- gensim/test/test_indirect_confirmation.py | 25 ++- .../indirect_confirmation_measure.py | 175 +++++++++--------- 2 files changed, 107 insertions(+), 93 deletions(-) diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index 8fca92a34a..6bdc8abe32 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -12,9 +12,11 @@ import unittest from gensim.topic_coherence import indirect_confirmation_measure +from gensim.topic_coherence import text_analysis +from gensim.corpora.dictionary import Dictionary import numpy as np -from numpy import array + class TestIndirectConfirmation(unittest.TestCase): def setUp(self): @@ -22,17 +24,21 @@ def setUp(self): # of this module. See the modules for the mathematical formulas self.topics = [np.array([1, 2])] # Result from s_one_set segmentation: - self.segmentation = [[(1, array([1, 2])), (2, array([1, 2]))]] - self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] self.gamma = 1 self.measure = 'nlr' - self.num_docs = 5 + + dictionary = Dictionary() + dictionary.id2token = {1: 'fake', 2: 'tokens'} + self.accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + self.accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + self.accumulator._num_docs = 5 def testCosineSimilarity(self): """Test cosine_similarity()""" - obtained = indirect_confirmation_measure.cosine_similarity(self.topics, self.segmentation, - self.posting_list, self.measure, - self.gamma, self.num_docs) + obtained = indirect_confirmation_measure.cosine_similarity( + self.topics, self.segmentation, self.accumulator, self.measure, self.gamma) + # The steps involved in this calculation are as follows: # 1. Take (1, array([1, 2]). Take w' which is 1. # 2. Calculate nlr(1, 1), nlr(1, 2). This is our first vector. @@ -41,8 +47,9 @@ def testCosineSimilarity(self): # 5. Find out cosine similarity between these two vectors. # 6. Similarly for the second segmentation. 
expected = [0.6230, 0.6230] # To account for EPSILON approximation - self.assertAlmostEqual(obtained[0], expected[0], 4) - self.assertAlmostEqual(obtained[1], expected[1], 4) + for i in range(len(expected)): + self.assertAlmostEqual(obtained[i], expected[i], 4) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index c4585ad677..8309e791c8 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -24,54 +24,16 @@ """ import logging +import itertools + +import scipy.sparse as sps import numpy as np from gensim.topic_coherence import direct_confirmation_measure -from gensim.matutils import cossim logger = logging.getLogger(__name__) -def _present(w_prime_star, w, w_backtrack): - """ - Internal helper function to return index of (w_prime_star, w) in w_backtrack. - Return -1 if not present. - """ - index = -1 - flag = 0 - for arr in w_backtrack: - index += 1 - if np.all(w_prime_star == arr[0]) and np.all(w == arr[1]): - flag += 1 - break - if not flag: - return -1 - return index - - -def _make_seg(w_prime, w, accumulator, measure, gamma, backtrack): - """ - Internal helper function to return context vectors for segmentations. - """ - context_vectors = {} - if isinstance(w_prime, np.ndarray): - for w_j in w: - for w_i in w_prime: - if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], accumulator, measure[1])[0] - if w_j not in context_vectors: - context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma - else: - context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma - else: - for w_j in w: - if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], accumulator, measure[1])[0] - context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma - - return context_vectors, backtrack - - def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): """ This function calculates the indirect cosine measure. Given context vectors @@ -88,56 +50,101 @@ def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): ---- topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors. - num_docs : Total number of documents in corresponding corpus. Returns: ------- s_cos_sim : array of cosine similarity of the context vectors for each segmentation """ - if measure == 'nlr': - # make normalized log ratio measure tuple - measure = (direct_confirmation_measure.log_ratio_measure, True) - else: - raise ValueError("The direct confirmation measure you entered is not currently supported.") - backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2). 
- """ - For backtracking context vectors, we will create a list called w_backtrack to store (w_prime, w) or - (w_star, w) tuples and a corresponding list context_vector_backtrack which will create a - mapping of (w_prime or w_star, w) ---> context_vector. - """ - w_backtrack = [] - context_vector_backtrack = [] + context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) + s_cos_sim = [] - for top_words, s_i in zip(topics, segmented_topics): - for w_prime, w_star in s_i: - # Step 1. Check if (w_prime, top_words) tuple in w_backtrack. - # Step 2. If yes, return corresponding context vector - w_prime_index = _present(w_prime, top_words, w_backtrack) - if w_backtrack and w_prime_index != -1: - w_prime_context_vectors = context_vector_backtrack[w_prime_index] - else: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, accumulator, measure, gamma, backtrack) - backtrack.update(backtrack_i) - # Update backtracking lists - w_backtrack.append((w_prime, top_words)) - context_vector_backtrack.append(w_prime_context_vectors) - - # Step 1. Check if (w_star, top_words) tuple in w_backtrack. - # Step 2. If yes, check if corresponding w is the same - w_star_index = _present(w_star, top_words, w_backtrack) - if w_backtrack and w_star_index != -1: - w_star_context_vectors = context_vector_backtrack[w_star_index] - else: - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, accumulator, measure, gamma, backtrack) - backtrack.update(backtrack_i) - # Update all backtracking lists - w_backtrack.append((w_star, top_words)) - context_vector_backtrack.append(w_star_context_vectors) - - s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items()) - s_cos_sim.append(s_cos_sim_i) + for topic_words, topic_segments in zip(topics, segmented_topics): + topic_words = tuple(topic_words) # because tuples are hashable + for w_prime, w_star in topic_segments: + w_prime_cv = context_vectors[w_prime, topic_words] + w_star_cv = context_vectors[w_star, topic_words] + s_cos_sim.append(_cossim(w_prime_cv, w_star_cv)) return s_cos_sim + + +class ContextVectorComputer(object): + """Lazily compute context vectors for topic segments.""" + + def __init__(self, measure, topics, accumulator, gamma): + if measure == 'nlr': + self.similarity = _pair_npmi + else: + raise ValueError("The direct confirmation measure you entered is not currently supported.") + + self.mapping = _map_to_contiguous(topics) + self.vocab_size = len(self.mapping) + self.accumulator = accumulator + self.gamma = gamma + self.sim_cache = {} # Cache similarities between tokens represented as pairs of word ids, e.g. (1, 2) + self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector + + def __getitem__(self, idx): + return self.compute_context_vector(*idx) + + def compute_context_vector(self, segment_word_ids, topic_word_ids): + """ + Step 1. Check if (segment_word_ids, topic_word_ids) context vector has been cached. + Step 2. If yes, return corresponding context vector, else compute, cache, and return. 
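An illustrative call sequence (names are hypothetical; `accumulator` is any word-occurrence accumulator produced by the probability_estimation module):

    cv_computer = ContextVectorComputer('nlr', topics, accumulator, gamma=1)
    topic_words = tuple(topics[0])                 # tuples are hashable, hence cacheable
    vec = cv_computer[topics[0][0], topic_words]   # sparse column vector over the topic vocabulary
    vec2 = cv_computer[topics[0][0], topic_words]  # identical key, served from the cache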
+ """ + key = _key_for_segment(segment_word_ids, topic_word_ids) + context_vector = self.context_vector_cache.get(key, None) + if context_vector is None: + context_vector = self._make_seg(segment_word_ids, topic_word_ids) + self.context_vector_cache[key] = context_vector + return context_vector + + def _make_seg(self, segment_word_ids, topic_word_ids): + """Internal helper function to return context vectors for segmentations.""" + context_vector = sps.lil_matrix((self.vocab_size, 1)) + if not hasattr(segment_word_ids, '__iter__'): + segment_word_ids = (segment_word_ids,) + + for w_j in topic_word_ids: + idx = (self.mapping[w_j], 0) + for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids): + if pair not in self.sim_cache: + self.sim_cache[pair] = self.similarity(pair, self.accumulator) + + context_vector[idx] += self.sim_cache[pair] ** self.gamma + + return context_vector.tocsr() + + +def _pair_npmi(pair, accumulator): + """Compute normalized pairwise mutual information (NPMI) between a pair of words. + The pair is an iterable of (word_id1, word_id2). + """ + return direct_confirmation_measure.log_ratio_measure([[pair]], accumulator, True)[0] + + +def _cossim(cv1, cv2): + return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2)) + + +def _magnitude(sparse_vec): + return np.sqrt(np.sum(sparse_vec.data ** 2)) + + +def _map_to_contiguous(ids_iterable): + uniq_ids = {} + n = 0 + for id_ in itertools.chain.from_iterable(ids_iterable): + if id_ not in uniq_ids: + uniq_ids[id_] = n + n += 1 + return uniq_ids + + +def _key_for_segment(segment, topic_words): + """A segment may have a single number of an iterable of them.""" + segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment + return segment_key, topic_words From 327b7391ab2a0e01646a4009076d75237bffee7a Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 29 May 2017 20:56:14 -0400 Subject: [PATCH 12/33] #1342: Cleanup, documentation improvements, proper caching of accumulator in CoherenceModel, and various test fixes. --- gensim/models/coherencemodel.py | 120 +++++++++++++----- gensim/test/test_indirect_confirmation.py | 2 +- gensim/test/test_probability_estimation.py | 22 ++-- .../indirect_confirmation_measure.py | 6 +- .../topic_coherence/probability_estimation.py | 35 +++-- gensim/topic_coherence/text_analysis.py | 83 ++++++++---- 6 files changed, 179 insertions(+), 89 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index d0ff707457..a29eefe5fc 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -31,12 +31,13 @@ from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) +from gensim.topic_coherence.probability_estimation import unique_ids_from_segments from gensim.utils import is_corpus, FakeDict logger = logging.getLogger(__name__) -boolean_document_based = ['u_mass'] -sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] +boolean_document_based = {'u_mass'} +sliding_window_based = {'c_v', 'c_uci', 'c_npmi'} make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') coherence_dict = { @@ -66,9 +67,7 @@ class CoherenceModel(interfaces.TransformationABC): - """ - Objects of this class allow for building and maintaining a model for topic - coherence. + """Objects of this class allow for building and maintaining a model for topic coherence. 
The main methods are: @@ -169,21 +168,57 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.topn = topn self.model = model - if model is not None: - self.topics = self._get_topics() - elif topics is not None: - self.topics = [] - for topic in topics: - t_i = [] - for n, _ in enumerate(topic): - t_i.append(dictionary.token2id[topic[n]]) - self.topics.append(np.array(t_i)) - self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) self._accumulator = None + self._topics = None + self.topics = topics + + self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) def __str__(self): - return coherence_dict[self.coherence].__str__() + return str(self.measure) + + @property + def measure(self): + return coherence_dict[self.coherence] + + @property + def topics(self): + return self._topics + + @topics.setter + def topics(self, topics): + new_topics = None + if self.model is not None: + new_topics = self._get_topics() + if topics is not None: + logger.warn("Ignoring topics you are attempting to set in favor of model's topics: %s" % self.model) + elif topics is not None: + new_topics = [] + for topic in topics: + t_i = np.array([self.dictionary.token2id[topic[n]] for n, _ in enumerate(topic)]) + new_topics.append(np.array(t_i)) + + if self._relevant_ids_will_differ(new_topics): + logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") + self._accumulator = None + + self._topics = new_topics + + def _relevant_ids_will_differ(self, new_topics): + if not self._topics_differ(new_topics): + return False + + measure = self.measure + current_set = unique_ids_from_segments(measure.seg(self.topics)) + new_set = unique_ids_from_segments(measure.seg(new_topics)) + return not current_set.issuperset(new_set) + + def _topics_differ(self, new_topics): + return (new_topics is not None and + self._topics is not None and + self._accumulator is not None and + not np.equal(new_topics, self._topics).all()) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" @@ -205,26 +240,49 @@ def _get_topics(self): "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics - def get_coherence_per_topic(self): - measure = coherence_dict[self.coherence] - segmented_topics = measure.seg(self.topics) + def segment_topics(self): + return self.measure.seg(self.topics) + + def estimate_probabilities(self, segmented_topics=None): + """Accumulate word occurrences and co-occurrences from texts or corpus using + the optimal method for the chosen coherence metric. This operation may take + quite some time for the sliding window based coherence methods. 
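A sketch of the intended call pattern (cm is a hypothetical CoherenceModel instance): accumulate once, then score without re-reading the texts:

    cm.estimate_probabilities()                # expensive pass over texts or corpus
    per_topic = cm.get_coherence_per_topic()   # reuses the cached accumulator
    overall = cm.aggregate_measures(per_topic)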
+ """ + if segmented_topics is None: + segmented_topics = self.segment_topics() if self.coherence in boolean_document_based: - self._accumulator = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, self._accumulator) - - self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size, - processes=self.processes) - if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) + self._accumulator = self.measure.prob(self.corpus, segmented_topics) else: - normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, self._accumulator, normalize=normalize) + self._accumulator = self.measure.prob( + texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size, + processes=self.processes) + + return self._accumulator + + def get_coherence_per_topic(self, segmented_topics=None): + """Return list of coherence values for each topic based on pipeline parameters.""" + measure = self.measure + if segmented_topics is None: + segmented_topics = measure.seg(self.topics) + if self._accumulator is None: + self.estimate_probabilities(segmented_topics) + + if self.coherence in boolean_document_based: + kwargs = {} + elif self.coherence == 'c_v': + kwargs = dict(topics=self.topics, measure='nlr', gamma=1) + else: + kwargs = dict(normalize=(self.coherence == 'c_npmi')) + + return measure.conf(segmented_topics, self._accumulator, **kwargs) def aggregate_measures(self, confirmed_measures): - measure = coherence_dict[self.coherence] - return measure.aggr(confirmed_measures) + """Aggregate the individual topic coherence measures using + the pipeline's aggregation function. + """ + return self.measure.aggr(confirmed_measures) def get_coherence(self): """Return coherence value based on pipeline parameters.""" diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index 6bdc8abe32..aedd9eaa9a 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -37,7 +37,7 @@ def setUp(self): def testCosineSimilarity(self): """Test cosine_similarity()""" obtained = indirect_confirmation_measure.cosine_similarity( - self.topics, self.segmentation, self.accumulator, self.measure, self.gamma) + self.segmentation, self.accumulator, self.topics, self.measure, self.gamma) # The steps involved in this calculation are as follows: # 1. Take (1, array([1, 2]). Take w' which is 1. diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 68ac24e752..f87b7bc564 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -56,17 +56,20 @@ def setUp(self): def testPBooleanDocument(self): """Test p_boolean_document()""" # Unique topic ids are 5798, 10608, 12736 and 18451 - obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. 
- obtained, _ = probability_estimation.p_boolean_sliding_window( + accumulator = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {10}, 5798: {4, 5, 6, 7}} - self.assertEqual(expected, obtained) + self.assertEqual(1, accumulator[10608]) + self.assertEqual(3, accumulator[12736]) + self.assertEqual(1, accumulator[18451]) + self.assertEqual(4, accumulator[5798]) class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): @@ -100,17 +103,20 @@ def setUp(self): def testPBooleanDocument(self): """Test p_boolean_document()""" - obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. - obtained, _ = probability_estimation.p_boolean_sliding_window( + accumulator = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {1: {1}, 3: {8, 2, 3}, 9: {10}, 4: {4, 5, 6, 7}} - self.assertEqual(expected, obtained) + self.assertEqual(1, accumulator[1]) + self.assertEqual(3, accumulator[3]) + self.assertEqual(1, accumulator[9]) + self.assertEqual(4, accumulator[4]) if __name__ == '__main__': diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8309e791c8..eccfb0a3b5 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -34,7 +34,7 @@ logger = logging.getLogger(__name__) -def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): +def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1): """ This function calculates the indirect cosine measure. Given context vectors _ _ _ _ @@ -48,11 +48,11 @@ def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): Args: ---- - topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + topics : Topics obtained from the trained topic model. measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma : Gamma value for computing W', W* vectors. + gamma : Gamma value for computing W', W* vectors; default is 1. Returns: ------- diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 604fa07a24..fb583b99fc 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -11,30 +11,12 @@ import logging import itertools -import numpy as np - from gensim.topic_coherence.text_analysis import \ CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator logger = logging.getLogger(__name__) -def _ret_top_ids(segmented_topics): - """ - Helper function to return a set of all the unique topic ids in segmented topics. 
- """ - top_ids = set() # is a set of all the unique ids contained in topics. - for s_i in segmented_topics: - for word_id in itertools.chain.from_iterable(s_i): - if isinstance(word_id, np.ndarray): - for i in word_id: - top_ids.add(i) - else: - top_ids.add(word_id) - - return top_ids - - def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -50,7 +32,7 @@ def p_boolean_document(corpus, segmented_topics): per_topic_postings : Boolean document posting list for each unique topic id. num_docs : Total number of documents in corpus. """ - top_ids = _ret_top_ids(segmented_topics) + top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) @@ -73,10 +55,23 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p per_topic_postings : Boolean sliding window postings list of all the unique topic ids. window_id[0] : Total no of windows """ - top_ids = _ret_top_ids(segmented_topics) + top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: accumulator = WordOccurrenceAccumulator(top_ids, dictionary) else: accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) logger.info("using %s to estimate probabilities from sliding windows" % accumulator) return accumulator.accumulate(texts, window_size) + + +def unique_ids_from_segments(segmented_topics): + """Return the set of all unique ids in a list of segmented topics.""" + top_ids = set() # is a set of all the unique ids contained in topics. + for s_i in segmented_topics: + for word_id in itertools.chain.from_iterable(s_i): + if hasattr(word_id, '__iter__'): + top_ids = top_ids.union(word_id) + else: + top_ids.add(word_id) + + return top_ids diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a9265347a3..180d378e4b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -25,7 +25,7 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. This function abstracts away the differences between the HashDictionary and the standard one. - + Args: ---- ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). @@ -159,20 +159,21 @@ def analyze_text(self, text): def accumulate(self, corpus): for document in corpus: self.analyze_text(document) - self._num_docs += 1 + self.num_docs += 1 return self -class TextsAnalyzer(UsesDictionary): - """Gather some statistics about relevant terms a corpus by iterating over texts.""" +class WindowedTextsAnalyzer(UsesDictionary): + """Gather some statistics about relevant terms of a corpus by iterating over windows of texts.""" def __init__(self, relevant_ids, dictionary): """ Args: ---- - relevant_words: the set of words that occurrences should be accumulated for. + relevant_ids: the set of words that occurrences should be accumulated for. + dictionary: gensim.corpora.dictionary.Dictionary instance with mappings for the relevant_ids. 
""" - super(TextsAnalyzer, self).__init__(relevant_ids, dictionary) + super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) def filter_to_relevant_words(self, text): """Lazily filter the text to only those words which are relevant.""" @@ -195,7 +196,7 @@ def accumulate(self, texts, window_size): return self -class InvertedIndexAccumulator(TextsAnalyzer, InvertedIndexBased): +class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): """Build an inverted index from a sequence of corpus texts.""" def analyze_text(self, window): @@ -203,8 +204,8 @@ def analyze_text(self, window): self._inverted_index[word_id].add(self._num_docs) -class WordOccurrenceAccumulator(TextsAnalyzer): - """Accumulate word occurrences and co-occurrences from a corpus of texts.""" +class WordOccurrenceAccumulator(WindowedTextsAnalyzer): + """Accumulate word occurrences and co-occurrences from a sequence of corpus texts.""" def __init__(self, *args): super(WordOccurrenceAccumulator, self).__init__(*args) @@ -224,6 +225,20 @@ def analyze_text(self, window): for combo in itertools.combinations(relevant_words, 2): self._co_occurrences[combo] += 1 + def accumulate(self, texts, window_size): + self._co_occurrences = self._co_occurrences.tolil() + self.partial_accumulate(texts, window_size) + self._symmetrize() + return self + + def partial_accumulate(self, texts, window_size): + """Meant to be called several times to accumulate partial results. The final + accumulation should be performed with the `accumulate` method as opposed to this one. + This method does not ensure the co-occurrence matrix is in lil format and does not + symmetrize it after accumulation. + """ + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. Rather than enforcing a particular ordering during the update process, @@ -233,12 +248,6 @@ def _symmetrize(self): co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') - def accumulate(self, texts, window_size): - self._co_occurrences = self._co_occurrences.tolil() - super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - self._symmetrize() - return self - def _get_occurrences(self, word_id): return self._occurrences[word_id] @@ -251,14 +260,7 @@ def merge(self, other): self._num_docs += other._num_docs -class _WordOccurrenceAccumulator(WordOccurrenceAccumulator): - """Monkey patched to avoid symmetrizing co-occurrence matrix after each batch.""" - def accumulate(self, texts, window_size): - TextsAnalyzer.accumulate(self, texts, window_size) - return self - - -class ParallelWordOccurrenceAccumulator(TextsAnalyzer): +class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): """Accumulate word occurrences in parallel.""" def __init__(self, processes, *args, **kwargs): @@ -285,11 +287,17 @@ def accumulate(self, texts, window_size): return self.merge_accumulators(accumulators) def start_workers(self, window_size): + """Set up an input and output queue and start processes for each worker. + + The input queue is used to transmit batches of documents to the workers. + The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. + Returns: tuple of (list of workers, input queue, output queue). 
+ """ input_q = mp.Queue(maxsize=self.processes) output_q = mp.Queue() workers = [] for _ in range(self.processes): - accumulator = _WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) worker.start() workers.append(worker) @@ -297,6 +305,9 @@ def start_workers(self, window_size): return workers, input_q, output_q def yield_batches(self, texts): + """Return a generator over the given texts that yields batches of + `batch_size` texts at a time. + """ batch = [] for text in texts: batch.append(text) @@ -308,6 +319,9 @@ def yield_batches(self, texts): yield batch def queue_all_texts(self, q, texts, window_size): + """Sequentially place batches of texts on the given queue until `texts` is consumed. + The texts are filtered so that only those with at least one relevant token are queued. + """ relevant_texts = (text for text in texts if self.text_is_relevant(text)) for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): q.put(batch, block=True) @@ -318,6 +332,18 @@ def queue_all_texts(self, q, texts, window_size): batch_num, batch_num * self.batch_size, self._num_docs)) def terminate_workers(self, input_q, output_q, workers, interrupted=False): + """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, + then terminate each. We do not use join here because it has been shown to have some issues + in Python 2.7 (and even in later versions). This method also closes both the input and output + queue. + + If `interrupted` is False (normal execution), a None value is placed on the input queue for + each worker. The workers are looking for this sentinel value and interpret it as a signal to + terminate themselves. If `interrupted` is True, a KeyboardInterrupt occurred. The workers are + programmed to recover from this and continue on to transmit their results before terminating. + So in this instance, the sentinel values are not queued, but the rest of the execution + continues as usual. + """ if not interrupted: for _ in workers: input_q.put(None, block=True) @@ -336,9 +362,15 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): return accumulators def merge_accumulators(self, accumulators): + """Merge the list of accumulators into a single `WordOccurrenceAccumulator` with all + occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed + by all the individual accumulators. + """ accumulator = accumulators[0] for other_accumulator in accumulators[1:]: accumulator.merge(other_accumulator) + # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. + # This is by design, to avoid unnecessary matrix additions during accumulation. 
accumulator._symmetrize() return accumulator @@ -371,7 +403,7 @@ def _run(self): if docs is None: # sentinel value break - self.accumulator.accumulate(docs, self.window_size) + self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( batch_num, n_docs, self.accumulator.num_docs)) @@ -381,4 +413,3 @@ def reply_to_master(self): logger.info("serializing accumulator to return to master...") self.output_q.put(self.accumulator, block=False) logger.info("accumulator serialized") - From e06c7c3c53dcaebf727da89c9f24b0af790a9fce Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 05:41:50 -0400 Subject: [PATCH 13/33] #1342: Do not swallow `KeyboardInterrupt` naively in `WikiCorpus.get_texts`; instead, log warning and do not set `length`. --- gensim/corpora/wikicorpus.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 209946fb41..13b111db4f 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -250,7 +250,8 @@ def process_article(args): return result, title, pageid -def init_worker(): +def init_to_ignore_interrupt(): + """Should only be used when master is prepared to handle termination of child processes.""" signal.signal(signal.SIGINT, signal.SIG_IGN) @@ -304,13 +305,16 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) - pool = multiprocessing.Pool(self.processes, init_worker) - # process the corpus in smaller chunks of docs, because multiprocessing.Pool - # is dumb and would load the entire input into RAM at once... + texts = ((text, self.lemmatize, title, pageid) + for title, text, pageid + in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) + try: + # process the corpus in smaller chunks of docs, because multiprocessing.Pool + # is dumb and would load the entire input into RAM at once... 
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): - for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): + for tokens, title, pageid in pool.imap(process_article, group): articles_all += 1 positions_all += len(tokens) # article redirects and short stubs are pruned here @@ -323,13 +327,15 @@ def get_texts(self): else: yield tokens except KeyboardInterrupt: - pass - - pool.terminate() - - logger.info( - "finished iterating over Wikipedia corpus of %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) - self.length = articles # cache corpus length + logger.warn("user terminated iteration over Wikipedia corpus after %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + else: + logger.info( + "finished iterating over Wikipedia corpus of %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + self.length = articles # cache corpus length + finally: + pool.terminate() # endclass WikiCorpus From 2ca43f7378e962e33d7be4e836444ad2bfbe0117 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 05:47:01 -0400 Subject: [PATCH 14/33] #1342: Formatting fixes (hanging indent in `coherencemodel` and non-empty blank lines in `text_analysis`. --- gensim/models/coherencemodel.py | 6 +++--- gensim/topic_coherence/text_analysis.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index a29eefe5fc..cff32fe2c2 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -216,9 +216,9 @@ def _relevant_ids_will_differ(self, new_topics): def _topics_differ(self, new_topics): return (new_topics is not None and - self._topics is not None and - self._accumulator is not None and - not np.equal(new_topics, self._topics).all()) + self._topics is not None and + self._accumulator is not None and + not np.equal(new_topics, self._topics).all()) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 180d378e4b..7b12572fb8 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -288,7 +288,7 @@ def accumulate(self, texts, window_size): def start_workers(self, window_size): """Set up an input and output queue and start processes for each worker. - + The input queue is used to transmit batches of documents to the workers. The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. Returns: tuple of (list of workers, input queue, output queue). @@ -336,7 +336,7 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): then terminate each. We do not use join here because it has been shown to have some issues in Python 2.7 (and even in later versions). This method also closes both the input and output queue. - + If `interrupted` is False (normal execution), a None value is placed on the input queue for each worker. The workers are looking for this sentinel value and interpret it as a signal to terminate themselves. 
If `interrupted` is True, a KeyboardInterrupt occurred. The workers are From 825b0e9f8f60b1f6c217f54eca3fb213b4e9e80a Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 06:03:47 -0400 Subject: [PATCH 15/33] #1342: Improve `CoherenceModel` documentation and minor refactor for variable interpretability. --- gensim/models/coherencemodel.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index cff32fe2c2..15d680a06c 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -73,6 +73,19 @@ class CoherenceModel(interfaces.TransformationABC): 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, 2. the ``get_coherence()`` method, which returns the topic coherence. + + Pipeline phases can also be executed individually. Methods for doing this are: + + 1. `segment_topics()`, which performs segmentation of the given topics into their comparison sets. + 2. `estimate_probabilities()`, which accumulates word occurrence stats from the given corpus or texts. + The output of this is also cached on the `CoherenceModel`, so calling this method can be used as + a precomputation step for the next phase. + 3. `get_coherence_per_topic()`, which uses the segmented topics and estimated probabilities to compute + the coherence of each topic. This output can be used to rank topics in order of most coherent to + least. Such a ranking is useful if the intended use case of a topic model is document exploration + by a human. It is also useful for filtering out incoherent topics (keep top-n from ranked list). + 4. `aggregate_measures(topic_coherences)`, which uses the pipeline's aggregation method to compute + the overall coherence from the topic coherences. One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly provided if the model does not contain a dictionary already:: @@ -108,8 +121,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] corpus : Gensim document corpus. - dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. - If both are provided, dictionary will be used. + dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, + this is not needed. If both are provided, dictionary will be used. window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their probability estimator. For 'u_mass' this doesn't matter. If left 'None' the default window sizes are used which are: @@ -121,9 +134,12 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= 'c_v' 'c_uci' also popularly known as c_pmi 'c_npmi' - For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. - For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed. + For 'u_mass' corpus should be provided. If texts is provided, it will be converted + to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. + Corpus is not needed. topn : Integer corresponding to the number of top words to be extracted from each topic. 
+ processes : number of processes to use for probability estimation phase; any value less than 1 will be + interpreted to mean num_cpus - 1; default is -1. """ if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") @@ -196,8 +212,8 @@ def topics(self, topics): elif topics is not None: new_topics = [] for topic in topics: - t_i = np.array([self.dictionary.token2id[topic[n]] for n, _ in enumerate(topic)]) - new_topics.append(np.array(t_i)) + topic_token_ids = np.array([self.dictionary.token2id[token] for token in topic]) + new_topics.append(topic_token_ids) if self._relevant_ids_will_differ(new_topics): logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") @@ -278,11 +294,11 @@ def get_coherence_per_topic(self, segmented_topics=None): return measure.conf(segmented_topics, self._accumulator, **kwargs) - def aggregate_measures(self, confirmed_measures): + def aggregate_measures(self, topic_coherences): """Aggregate the individual topic coherence measures using the pipeline's aggregation function. """ - return self.measure.aggr(confirmed_measures) + return self.measure.aggr(topic_coherences) def get_coherence(self): """Return coherence value based on pipeline parameters.""" From 314a400912ead837e99f2ce30e9be2cbe0381ff9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 17:03:44 -0400 Subject: [PATCH 16/33] #1342: Optimize word occurrence accumulation and fix a bug with repeated counting of tokens that occur more than once in a window. --- gensim/test/test_text_analysis.py | 20 ++++--- gensim/topic_coherence/text_analysis.py | 72 ++++++++++++++++--------- gensim/utils.py | 27 ++++++---- 3 files changed, 77 insertions(+), 42 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 8ee08a2373..ed6d482b44 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -12,7 +12,8 @@ class TextAnalyzerTestBase(unittest.TestCase): texts = [ ['this', 'is', 'a'], ['test', 'document'], - ['this', 'test', 'document'] + ['this', 'test', 'document'], + ['test', 'test', 'this'] ] token2id = { 'this': 10, @@ -51,11 +52,12 @@ def init_accumulator2(self): def test_occurrence_counting(self): accumulator = self.init_accumulator()\ .accumulate(self.texts, 3) - self.assertEqual(2, accumulator.get_occurrences("this")) + self.assertEqual(3, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) self.assertEqual(1, accumulator.get_occurrences("a")) self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) + self.assertEqual(2, accumulator.get_co_occurrences("test", "this")) self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): @@ -101,13 +103,14 @@ class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): def test_accumulate1(self): accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 2) - # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] + # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], + # ['test', 'document'], ['test', 'test'], ['test', 'this']] inverted_index = accumulator.index_to_dict() expected = { - 10: {0, 3}, + 10: {0, 3, 6}, 15: {0, 1}, 20: {1}, - 21: {2, 3, 4}, + 21: {2, 3, 4, 5, 6}, 17: {2, 4} } self.assertDictEqual(expected, inverted_index) @@ -115,13 +118,14 @@ def test_accumulate1(self): def test_accumulate2(self): 
accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) - # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] + # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document'], + # ['test', 'test', 'this'] inverted_index = accumulator.index_to_dict() expected = { - 10: {0, 2}, + 10: {0, 2, 3}, 15: {0}, 20: {0}, - 21: {1, 2}, + 21: {1, 2, 3}, 17: {1, 2} } self.assertDictEqual(expected, inverted_index) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 7b12572fb8..b2b43e9382 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -65,7 +65,7 @@ def num_docs(self, num): logger.info("%s accumulated stats from %d documents" % ( self.__class__.__name__, self._num_docs)) - def analyze_text(self, text): + def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") def __getitem__(self, word_or_words): @@ -100,9 +100,6 @@ def __init__(self, relevant_ids, dictionary): self.dictionary = dictionary self.token2id = dictionary.token2id - def analyze_text(self, text): - raise NotImplementedError("Base classes should implement analyze_text.") - def get_occurrences(self, word): """Return number of docs the word occurs in, once `accumulate` has been called.""" try: @@ -149,7 +146,7 @@ def index_to_dict(self): class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" - def analyze_text(self, text): + def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) if len(top_ids_in_doc) > 0: @@ -164,7 +161,7 @@ def accumulate(self, corpus): class WindowedTextsAnalyzer(UsesDictionary): - """Gather some statistics about relevant terms of a corpus by iterating over windows of texts.""" + """Gather some stats about relevant terms of a corpus by iterating over windows of texts.""" def __init__(self, relevant_ids, dictionary): """ @@ -181,6 +178,22 @@ def filter_to_relevant_words(self, text): relevant_ids = (self.token2id[word] for word in relevant_words) return (self.id2contiguous[word_id] for word_id in relevant_ids) + def accumulate(self, texts, window_size): + relevant_texts = self._iter_texts(texts) + windows = utils.iter_windows(relevant_texts, window_size, ignore_below_size=False, + include_doc_num=True) + for doc_num, virtual_document in windows: + self.analyze_text(virtual_document, doc_num) + self.num_docs += 1 + return self + + def _iter_texts(self, texts): + for text in texts: + if self.text_is_relevant(text): + token_ids = (self.token2id[word] if word in self.relevant_words else None + for word in text) + yield [self.id2contiguous[_id] if _id is not None else None for _id in token_ids] + def text_is_relevant(self, text): """Return True if the text has any relevant words, else False.""" for word in text: @@ -188,20 +201,14 @@ def text_is_relevant(self, text): return True return False - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): - self.analyze_text(virtual_document) - self.num_docs += 1 - return self - class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): """Build an inverted index from a sequence of corpus texts.""" - def 
analyze_text(self, window): - for word_id in self.filter_to_relevant_words(window): - self._inverted_index[word_id].add(self._num_docs) + def analyze_text(self, window, doc_num=None): + for word_id in window: + if word_id is not None: + self._inverted_index[word_id].add(self._num_docs) class WordOccurrenceAccumulator(WindowedTextsAnalyzer): @@ -216,15 +223,6 @@ def __init__(self, *args): def __str__(self): return self.__class__.__name__ - def analyze_text(self, window): - relevant_words = list(self.filter_to_relevant_words(window)) - if relevant_words: - uniq_words = np.array(relevant_words) - self._occurrences[uniq_words] += 1 - - for combo in itertools.combinations(relevant_words, 2): - self._co_occurrences[combo] += 1 - def accumulate(self, texts, window_size): self._co_occurrences = self._co_occurrences.tolil() self.partial_accumulate(texts, window_size) @@ -237,7 +235,31 @@ def partial_accumulate(self, texts, window_size): This method does not ensure the co-occurrence matrix is in lil format and does not symmetrize it after accumulation. """ + self._current_doc_num = -1 + self._token_at_edge = None super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + return self + + def analyze_text(self, window, doc_num=None): + if doc_num != self._current_doc_num: + self._uniq_words = set(window) + self._uniq_words.discard(None) + self._token_at_edge = window[0] + self._current_doc_num = doc_num + else: + if self._token_at_edge is not None: + self._uniq_words.remove(self._token_at_edge) + self._token_at_edge = window[0] + + if window[-1] is not None: + self._uniq_words.add(window[-1]) + + if self._uniq_words: + words_idx = np.array(list(self._uniq_words)) + self._occurrences[words_idx] += 1 + + for combo in itertools.combinations(words_idx, 2): + self._co_occurrences[combo] += 1 def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. diff --git a/gensim/utils.py b/gensim/utils.py index f0488d2943..8b57871d5a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1229,7 +1229,7 @@ def strided_windows(ndarray, window_size): strides=(stride, stride)) -def iter_windows(texts, window_size, copy=False, ignore_below_size=True): +def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. @@ -1243,11 +1243,20 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): If False, the documents below `window_size` will be yielded as the full document. 
""" - for document in texts: - doc_windows = strided_windows(document, window_size) - if doc_windows.shape[0] == 0: - if not ignore_below_size: - yield document.copy() if copy else document - else: - for doc_window in doc_windows: - yield doc_window.copy() if copy else doc_window + for doc_num, document in enumerate(texts): + for window in _iter_windows(document, window_size, copy, ignore_below_size): + if include_doc_num: + yield (doc_num, window) + else: + yield window + + +def _iter_windows(document, window_size, copy=False, ignore_below_size=True): + doc_windows = strided_windows(document, window_size) + if doc_windows.shape[0] == 0: + if not ignore_below_size: + yield document.copy() if copy else document + else: + for doc_window in doc_windows: + yield doc_window.copy() if copy else doc_window + From e7857734f0d44b71b80ec4e3f3ef1ef6bb2eaa47 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 10:42:32 -0400 Subject: [PATCH 17/33] #1342: Minor bug fixes and improved logging in text_analysis module; cleaned up spacing in coherencemodel. --- gensim/models/coherencemodel.py | 4 ++-- gensim/topic_coherence/text_analysis.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 15d680a06c..adcac0f27a 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -73,9 +73,9 @@ class CoherenceModel(interfaces.TransformationABC): 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, 2. the ``get_coherence()`` method, which returns the topic coherence. - + Pipeline phases can also be executed individually. Methods for doing this are: - + 1. `segment_topics()`, which performs segmentation of the given topics into their comparison sets. 2. `estimate_probabilities()`, which accumulates word occurrence stats from the given corpus or texts. The output of this is also cached on the `CoherenceModel`, so calling this method can be used as diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index b2b43e9382..0a61c5ba0e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -10,8 +10,9 @@ """ import sys -import itertools import logging +import itertools +import traceback import multiprocessing as mp import numpy as np @@ -248,7 +249,7 @@ def analyze_text(self, window, doc_num=None): self._current_doc_num = doc_num else: if self._token_at_edge is not None: - self._uniq_words.remove(self._token_at_edge) + self._uniq_words.discard(self._token_at_edge) # may be irrelevant token self._token_at_edge = window[0] if window[-1] is not None: @@ -351,7 +352,7 @@ def queue_all_texts(self, q, texts, window_size): self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( - batch_num, batch_num * self.batch_size, self._num_docs)) + batch_num, (batch_num + 1) * self.batch_size, self._num_docs)) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -394,6 +395,8 @@ def merge_accumulators(self, accumulators): # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions during accumulation. 
accumulator._symmetrize() + logger.info("accumulated word occurrence stats for %d virtual documents" % + accumulator.num_docs) return accumulator @@ -411,9 +414,13 @@ def __init__(self, input_q, output_q, accumulator, window_size): def run(self): try: self._run() + print("finished normally") except KeyboardInterrupt: logger.info("%s interrupted after processing %d documents" % ( self.__class__.__name__, self.accumulator.num_docs)) + except Exception as e: + logger.error("worker encountered unexpected exception: %s" % e) + logger.error(traceback.format_exc()) finally: self.reply_to_master() @@ -423,6 +430,7 @@ def _run(self): while True: docs = self.input_q.get(block=True) if docs is None: # sentinel value + logger.debug("observed sentinel value; terminating") break self.accumulator.partial_accumulate(docs, self.window_size) From 5f78cdb2bcea50975fcf8cabb3565f337406ed59 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:03:32 -0400 Subject: [PATCH 18/33] #1342: Optimize data structures being used for window set tracking and avoid undue network traffic by moving relevancy filtering and token conversion to the master process. --- gensim/topic_coherence/text_analysis.py | 123 ++++++++++++++---------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 0a61c5ba0e..81989992d9 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -51,6 +51,7 @@ class BaseAnalyzer(object): def __init__(self, relevant_ids): self.relevant_ids = relevant_ids + self._vocab_size = len(self.relevant_ids) self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} self.log_every = 1000 self._num_docs = 0 @@ -92,7 +93,8 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. - The standard BaseAnalyzer can only deal with token ids since it does not have access to the token2id mapping. + The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id + mapping. """ def __init__(self, relevant_ids, dictionary): @@ -128,8 +130,7 @@ class InvertedIndexBased(BaseAnalyzer): def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) - vocab_size = len(self.relevant_ids) - self._inverted_index = np.array([set() for _ in range(vocab_size)]) + self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) def _get_occurrences(self, word_id): return len(self._inverted_index[word_id]) @@ -169,15 +170,10 @@ def __init__(self, relevant_ids, dictionary): Args: ---- relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: gensim.corpora.dictionary.Dictionary instance with mappings for the relevant_ids. + dictionary: Dictionary instance with mappings for the relevant_ids. 
""" super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) - - def filter_to_relevant_words(self, text): - """Lazily filter the text to only those words which are relevant.""" - relevant_words = (word for word in text if word in self.relevant_words) - relevant_ids = (self.token2id[word] for word in relevant_words) - return (self.id2contiguous[word_id] for word_id in relevant_ids) + self._none_token = self._vocab_size # see _iter_texts for use of none token def accumulate(self, texts, window_size): relevant_texts = self._iter_texts(texts) @@ -189,11 +185,13 @@ def accumulate(self, texts, window_size): return self def _iter_texts(self, texts): + dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: if self.text_is_relevant(text): - token_ids = (self.token2id[word] if word in self.relevant_words else None - for word in text) - yield [self.id2contiguous[_id] if _id is not None else None for _id in token_ids] + yield np.array([ + self.id2contiguous[self.token2id[w]] if w in self.relevant_words + else self._none_token + for w in text], dtype=dtype) def text_is_relevant(self, text): """Return True if the text has any relevant words, else False.""" @@ -208,7 +206,7 @@ class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): def analyze_text(self, window, doc_num=None): for word_id in window: - if word_id is not None: + if word_id is not self._none_token: self._inverted_index[word_id].add(self._num_docs) @@ -217,9 +215,11 @@ class WordOccurrenceAccumulator(WindowedTextsAnalyzer): def __init__(self, *args): super(WordOccurrenceAccumulator, self).__init__(*args) - vocab_size = len(self.relevant_words) - self._occurrences = np.zeros(vocab_size, dtype='uint32') - self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + self._occurrences = np.zeros(self._vocab_size, dtype='uint32') + self._co_occurrences = sps.lil_matrix((self._vocab_size, self._vocab_size), dtype='uint32') + + self._uniq_words = np.zeros((self._vocab_size + 1,), dtype=bool) # add 1 for none token + self._mask = self._uniq_words[:-1] # to exclude none token def __str__(self): return self.__class__.__name__ @@ -242,25 +242,23 @@ def partial_accumulate(self, texts, window_size): return self def analyze_text(self, window, doc_num=None): + self.slide_window(window, doc_num) + if self._mask.any(): + self._occurrences[self._mask] += 1 + + for combo in itertools.combinations(np.nonzero(mask)[0], 2): + self._co_occurrences[combo] += 1 + + def slide_window(self, window, doc_num): if doc_num != self._current_doc_num: - self._uniq_words = set(window) - self._uniq_words.discard(None) - self._token_at_edge = window[0] + self._uniq_words[:] = False + self._uniq_words[np.unique(window)] = True self._current_doc_num = doc_num else: - if self._token_at_edge is not None: - self._uniq_words.discard(self._token_at_edge) # may be irrelevant token - self._token_at_edge = window[0] + self._uniq_words[self._token_at_edge] = False + self._uniq_words[window[-1]] = True - if window[-1] is not None: - self._uniq_words.add(window[-1]) - - if self._uniq_words: - words_idx = np.array(list(self._uniq_words)) - self._occurrences[words_idx] += 1 - - for combo in itertools.combinations(words_idx, 2): - self._co_occurrences[combo] += 1 + self._token_at_edge = window[0] def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. 
@@ -283,15 +281,31 @@ def merge(self, other): self._num_docs += other._num_docs +class PatchedWordOccurrenceAccumulator(WordOccurrenceAccumulator): + """Monkey patched for multiprocessing worker usage, + to move some of the logic to the master process. + """ + def _iter_texts(self, texts): + return texts # master process will handle this + + class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): """Accumulate word occurrences in parallel.""" def __init__(self, processes, *args, **kwargs): + """ + Args: + ---- + processes : number of processes to use; must be at least two. + args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). + kwargs : can include `batch_size`, which is the number of docs to send to a worker at a + time. If not included, it defaults to 32. + """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: - raise ValueError("Must have at least 2 processes to run in parallel; got %d" % processes) + raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) self.processes = processes - self.batch_size = kwargs.get('batch_size', 16) + self.batch_size = kwargs.get('batch_size', 32) def __str__(self): return "%s(processes=%s, batch_size=%s)" % ( @@ -303,7 +317,8 @@ def accumulate(self, texts, window_size): self.queue_all_texts(input_q, texts, window_size) interrupted = False except KeyboardInterrupt: - logger.warn("stats accumulation interrupted; <= %d documents processed" % self._num_docs) + logger.warn("stats accumulation interrupted; <= %d documents processed", + self._num_docs) interrupted = True accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) @@ -320,7 +335,7 @@ def start_workers(self, window_size): output_q = mp.Queue() workers = [] for _ in range(self.processes): - accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + accumulator = PatchedWordOccurrenceAccumulator(self.relevant_ids, self.dictionary) worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) worker.start() workers.append(worker) @@ -332,7 +347,7 @@ def yield_batches(self, texts): `batch_size` texts at a time. """ batch = [] - for text in texts: + for text in self._iter_texts(texts): batch.append(text) if len(batch) == self.batch_size: yield batch @@ -345,14 +360,14 @@ def queue_all_texts(self, q, texts, window_size): """Sequentially place batches of texts on the given queue until `texts` is consumed. The texts are filtered so that only those with at least one relevant token are queued. 
""" - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): + for batch_num, batch in enumerate(self.yield_batches(texts)): q.put(batch, block=True) before = self._num_docs / self.log_every self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): - logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( - batch_num, (batch_num + 1) * self.batch_size, self._num_docs)) + logger.info("%d batches submitted to accumulate stats from %d documents (%d " + "virtual)", + (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -392,10 +407,10 @@ def merge_accumulators(self, accumulators): accumulator = accumulators[0] for other_accumulator in accumulators[1:]: accumulator.merge(other_accumulator) - # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. - # This is by design, to avoid unnecessary matrix additions during accumulation. + # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. + # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. accumulator._symmetrize() - logger.info("accumulated word occurrence stats for %d virtual documents" % + logger.info("accumulated word occurrence stats for %d virtual documents", accumulator.num_docs) return accumulator @@ -414,20 +429,20 @@ def __init__(self, input_q, output_q, accumulator, window_size): def run(self): try: self._run() - print("finished normally") except KeyboardInterrupt: - logger.info("%s interrupted after processing %d documents" % ( - self.__class__.__name__, self.accumulator.num_docs)) + logger.info("%s interrupted after processing %d documents", + self.__class__.__name__, self.accumulator.num_docs) except Exception as e: - logger.error("worker encountered unexpected exception: %s" % e) - logger.error(traceback.format_exc()) + logger.error("worker encountered unexpected exception: %s\n%s", + e, traceback.format_exc()) finally: self.reply_to_master() def _run(self): - batch_num = 0 + batch_num = -1 n_docs = 0 while True: + batch_num += 1 docs = self.input_q.get(block=True) if docs is None: # sentinel value logger.debug("observed sentinel value; terminating") @@ -435,9 +450,11 @@ def _run(self): self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) - logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( - batch_num, n_docs, self.accumulator.num_docs)) - batch_num += 1 + logger.debug("completed batch %d; %d documents processed (%d virtual)", + batch_num, n_docs, self.accumulator.num_docs) + + logger.debug("finished all batches; %d documents processed (%d virtual)", + n_docs, self.accumulator.num_docs) def reply_to_master(self): logger.info("serializing accumulator to return to master...") From bbd27482f140521f64d1a396f9c2b91168881cb1 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:04:12 -0400 Subject: [PATCH 19/33] #1342: Fix accidental typo. 
--- gensim/topic_coherence/text_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 81989992d9..2424ad9ce9 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -246,7 +246,7 @@ def analyze_text(self, window, doc_num=None): if self._mask.any(): self._occurrences[self._mask] += 1 - for combo in itertools.combinations(np.nonzero(mask)[0], 2): + for combo in itertools.combinations(np.nonzero(self._mask)[0], 2): self._co_occurrences[combo] += 1 def slide_window(self, window, doc_num): From 5fb0b959039586d366bd1f128108d105aa338550 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:31:26 -0400 Subject: [PATCH 20/33] #1342: Further optimize word co-occurrence accumulation by using a `collections.Counter` instance for accumulation within a batch. --- gensim/topic_coherence/text_analysis.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 2424ad9ce9..371cfd22f5 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -14,6 +14,7 @@ import itertools import traceback import multiprocessing as mp +from collections import Counter import numpy as np import scipy.sparse as sps @@ -93,7 +94,7 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. - The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id + The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id mapping. """ @@ -220,6 +221,7 @@ def __init__(self, *args): self._uniq_words = np.zeros((self._vocab_size + 1,), dtype=bool) # add 1 for none token self._mask = self._uniq_words[:-1] # to exclude none token + self._counter = Counter() def __str__(self): return self.__class__.__name__ @@ -238,18 +240,21 @@ def partial_accumulate(self, texts, window_size): """ self._current_doc_num = -1 self._token_at_edge = None + self._counter.clear() + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + for combo, count in self._counter.iteritems(): + self._co_occurrences[combo] += count + return self def analyze_text(self, window, doc_num=None): - self.slide_window(window, doc_num) + self._slide_window(window, doc_num) if self._mask.any(): self._occurrences[self._mask] += 1 + self._counter.update(itertools.combinations(np.nonzero(self._mask)[0], 2)) - for combo in itertools.combinations(np.nonzero(self._mask)[0], 2): - self._co_occurrences[combo] += 1 - - def slide_window(self, window, doc_num): + def _slide_window(self, window, doc_num): if doc_num != self._current_doc_num: self._uniq_words[:] = False self._uniq_words[np.unique(window)] = True @@ -298,14 +303,14 @@ def __init__(self, processes, *args, **kwargs): ---- processes : number of processes to use; must be at least two. args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 32. + kwargs : can include `batch_size`, which is the number of docs to send to a worker at a + time. If not included, it defaults to 64. 
""" super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) self.processes = processes - self.batch_size = kwargs.get('batch_size', 32) + self.batch_size = kwargs.get('batch_size', 64) def __str__(self): return "%s(processes=%s, batch_size=%s)" % ( From 880b8d08d146dc5c3affdb8efaa28c77e077db50 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 10:47:33 -0400 Subject: [PATCH 21/33] #1342: Clean up logging in `text_analysis` module and remove empty line at end of `util` module. --- gensim/topic_coherence/text_analysis.py | 17 ++++++++--------- gensim/utils.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 371cfd22f5..6062c445b0 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -65,8 +65,8 @@ def num_docs(self): def num_docs(self, num): self._num_docs = num if self._num_docs % self.log_every == 0: - logger.info("%s accumulated stats from %d documents" % ( - self.__class__.__name__, self._num_docs)) + logger.info("%s accumulated stats from %d documents", + self.__class__.__name__, self._num_docs) def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") @@ -370,9 +370,9 @@ def queue_all_texts(self, q, texts, window_size): before = self._num_docs / self.log_every self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): - logger.info("%d batches submitted to accumulate stats from %d documents (%d " - "virtual)", - (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) + logger.info( + "%d batches submitted to accumulate stats from %d documents (%d virtual)", + (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -394,7 +394,7 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): accumulators = [] while len(accumulators) != len(workers): accumulators.append(output_q.get()) - logger.info("%d accumulators retrieved from output queue" % len(accumulators)) + logger.info("%d accumulators retrieved from output queue", len(accumulators)) for worker in workers: if worker.is_alive(): @@ -437,9 +437,8 @@ def run(self): except KeyboardInterrupt: logger.info("%s interrupted after processing %d documents", self.__class__.__name__, self.accumulator.num_docs) - except Exception as e: - logger.error("worker encountered unexpected exception: %s\n%s", - e, traceback.format_exc()) + except: + logger.exception("worker encountered unexpected exception") finally: self.reply_to_master() diff --git a/gensim/utils.py b/gensim/utils.py index 8b57871d5a..dd391f887b 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1259,4 +1259,3 @@ def _iter_windows(document, window_size, copy=False, ignore_below_size=True): else: for doc_window in doc_windows: yield doc_window.copy() if copy else doc_window - From 1d32b8eb8d29f3729a8029e7deacab159d1f03e5 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 11:07:20 -0400 Subject: [PATCH 22/33] #1342: Remove unused traceback module. 
--- gensim/topic_coherence/text_analysis.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6062c445b0..6a6cd6aaae 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -12,7 +12,6 @@ import sys import logging import itertools -import traceback import multiprocessing as mp from collections import Counter From 8e04b416cf0c6459dbef041dbc5345ac191a7e3c Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 11:47:07 -0400 Subject: [PATCH 23/33] #1342: Fixes for python3 compatibility. --- gensim/topic_coherence/text_analysis.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6a6cd6aaae..d73a3f7b8e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,6 +17,7 @@ import numpy as np import scipy.sparse as sps +from six import iteritems from gensim import utils @@ -141,7 +142,7 @@ def _get_co_occurrences(self, word_id1, word_id2): return len(s1.intersection(s2)) def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)} return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} @@ -242,7 +243,7 @@ def partial_accumulate(self, texts, window_size): self._counter.clear() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - for combo, count in self._counter.iteritems(): + for combo, count in iteritems(self._counter): self._co_occurrences[combo] += count return self @@ -427,7 +428,7 @@ def __init__(self, input_q, output_q, accumulator, window_size): self.input_q = input_q self.output_q = output_q self.accumulator = accumulator - self.accumulator.log_every = sys.maxint # avoid logging in workers + self.accumulator.log_every = sys.maxsize # avoid logging in workers self.window_size = window_size def run(self): From e3ce40244d8514d4d2311526f7613a2bd689a643 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 13:56:35 -0400 Subject: [PATCH 24/33] #1342: Hopefully `six.viewitems` works for python3 compatibility? 
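For the record: `six.viewitems(d)` resolves to `d.viewitems()` on Python 2 and
`d.items()` on Python 3, so it yields (key, value) pairs from both plain dicts
and `collections.Counter` without relying on the `iteritems` method that
Python 3 removed. (`six.iteritems` would also have iterated fine on Python 3;
the real breakage turned out to be elsewhere, as the next commit notes.)
Quick sanity check (standalone snippet; assumes only that `six` is installed):

    from collections import Counter

    from six import viewitems

    co_occurrence_counts = Counter({(0, 2): 3, (1, 2): 1})
    for token_pair, count in viewitems(co_occurrence_counts):
        print(token_pair, count)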
--- gensim/topic_coherence/text_analysis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index d73a3f7b8e..7e8e57d703 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,7 +17,7 @@ import numpy as np import scipy.sparse as sps -from six import iteritems +from six import viewitems from gensim import utils @@ -142,7 +142,7 @@ def _get_co_occurrences(self, word_id1, word_id2): return len(s1.intersection(s2)) def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)} + contiguous2id = {n: word_id for word_id, n in viewitems(self.id2contiguous)} return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} @@ -243,7 +243,7 @@ def partial_accumulate(self, texts, window_size): self._counter.clear() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - for combo, count in iteritems(self._counter): + for combo, count in viewitems(self._counter): self._co_occurrences[combo] += count return self From 7f7f55daf335de54793e63349aff22ce0ce123f2 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 16:19:34 -0400 Subject: [PATCH 25/33] #1342: Realized the python3 compatibility issue was due to the Dictionary mapping to different ids, so fixed the `probability_estimation` tests to be agnostic of this. Also fixed an issue with the interpretation of strings as iterables when getting occurrences of strings in the `text_analysis.BaseAnalyzer.__getitem__` method. --- gensim/test/test_probability_estimation.py | 167 +++++++++------------ gensim/topic_coherence/text_analysis.py | 8 +- 2 files changed, 75 insertions(+), 100 deletions(-) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index f87b7bc564..982230a526 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -16,107 +16,82 @@ from gensim.corpora.dictionary import Dictionary -class ProbabilityEstimationBase(unittest.TestCase): - texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] +class BaseTestCases(object): + class ProbabilityEstimationBase(unittest.TestCase): + texts = [['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees']] + dictionary = None -class TestProbabilityEstimation(ProbabilityEstimationBase): - def setUp(self): - self.dictionary = HashDictionary(self.texts) - # Following is the mapping: - # {'computer': 10608, - # 'eps': 31049, - # 'graph': 18451, - # 'human': 31002, - # 'interface': 12466, - # 'response': 5232, - # 'system': 5798, - # 'time': 29104, - # 'trees': 23844, - # 'user': 12736} - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] - # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [ - [ - (5798, 18451), - (10608, 18451), - (10608, 5798) - ], [ - (10608, 18451), - (12736, 18451), - (12736, 10608) + def build_segmented_topics(self): + # Suppose the segmented topics from s_one_pre are: + token2id = self.dictionary.token2id + computer_id = token2id['computer'] + system_id = token2id['system'] + user_id = token2id['user'] + graph_id = token2id['graph'] + 
self.segmented_topics = [ + [ + (system_id, graph_id), + (computer_id, graph_id), + (computer_id, system_id) + ], [ + (computer_id, graph_id), + (user_id, graph_id), + (user_id, computer_id) + ] ] - ] - - def testPBooleanDocument(self): - """Test p_boolean_document()""" - # Unique topic ids are 5798, 10608, 12736 and 18451 - accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - obtained = accumulator.index_to_dict() - expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} - self.assertEqual(expected, obtained) - - def testPBooleanSlidingWindow(self): - """Test p_boolean_sliding_window()""" - # Test with window size as 2. window_id is zero indexed. - accumulator = probability_estimation.p_boolean_sliding_window( - self.texts, self.segmented_topics, self.dictionary, 2) - self.assertEqual(1, accumulator[10608]) - self.assertEqual(3, accumulator[12736]) - self.assertEqual(1, accumulator[18451]) - self.assertEqual(4, accumulator[5798]) - - -class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): - def setUp(self): + + self.computer_id = computer_id + self.system_id = system_id + self.user_id = user_id + self.graph_id = graph_id + + def setup_dictionary(self): + raise NotImplementedError + + def setUp(self): + self.setup_dictionary() + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + self.build_segmented_topics() + + def testPBooleanDocument(self): + """Test p_boolean_document()""" + accumulator = probability_estimation.p_boolean_document( + self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() + expected = { + self.graph_id: {5}, + self.user_id: {1, 3}, + self.system_id: {1, 2}, + self.computer_id: {0} + } + self.assertEqual(expected, obtained) + + def testPBooleanSlidingWindow(self): + """Test p_boolean_sliding_window()""" + # Test with window size as 2. window_id is zero indexed. + accumulator = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + self.assertEqual(1, accumulator[self.computer_id]) + self.assertEqual(3, accumulator[self.user_id]) + self.assertEqual(1, accumulator[self.graph_id]) + self.assertEqual(4, accumulator[self.system_id]) + + +class TestProbabilityEstimation(BaseTestCases.ProbabilityEstimationBase): + def setup_dictionary(self): + self.dictionary = HashDictionary(self.texts) + + +class TestProbabilityEstimationWithNormalDictionary(BaseTestCases.ProbabilityEstimationBase): + def setup_dictionary(self): self.dictionary = Dictionary(self.texts) self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()} - # Following is the mapping: - # {u'computer': 1, - # u'eps': 5, - # u'graph': 9, - # u'human': 2, - # u'interface': 0, - # u'response': 6, - # u'system': 4, - # u'time': 7, - # u'trees': 8, - # u'user': 3} - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] - # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [ - [ - (4, 9), - (1, 9), - (1, 4) - ], [ - (1, 9), - (3, 9), - (3, 1) - ] - ] - - def testPBooleanDocument(self): - """Test p_boolean_document()""" - accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - obtained = accumulator.index_to_dict() - expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} - self.assertEqual(expected, obtained) - - def testPBooleanSlidingWindow(self): - """Test p_boolean_sliding_window()""" - # Test with window size as 2. window_id is zero indexed. 
- accumulator = probability_estimation.p_boolean_sliding_window( - self.texts, self.segmented_topics, self.dictionary, 2) - self.assertEqual(1, accumulator[1]) - self.assertEqual(3, accumulator[3]) - self.assertEqual(1, accumulator[9]) - self.assertEqual(4, accumulator[4]) if __name__ == '__main__': diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 7e8e57d703..8cdf1027fd 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,7 +17,7 @@ import numpy as np import scipy.sparse as sps -from six import viewitems +from six import viewitems, string_types from gensim import utils @@ -72,10 +72,10 @@ def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") def __getitem__(self, word_or_words): - if hasattr(word_or_words, '__iter__'): - return self.get_co_occurrences(*word_or_words) - else: + if isinstance(word_or_words, string_types) or not hasattr(word_or_words, '__iter__'): return self.get_occurrences(word_or_words) + else: + return self.get_co_occurrences(*word_or_words) def get_occurrences(self, word_id): """Return number of docs the word occurs in, once `accumulate` has been called.""" From 343da69f0c49e71131f71910ef5d70250d73285c Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 2 Jun 2017 08:47:04 -0400 Subject: [PATCH 26/33] #1342: Fixed a few bugs and added test coverage for the coherencemodel accumulator caching; made model a property with a setter that also sets the topics and uncaches the accumulator if the model's topics have ids not tracked by the accumulator. --- gensim/models/coherencemodel.py | 74 +++++++------ gensim/test/test_coherencemodel.py | 166 +++++++++++++++-------------- 2 files changed, 131 insertions(+), 109 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index adcac0f27a..e53d5600ca 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -38,28 +38,28 @@ boolean_document_based = {'u_mass'} sliding_window_based = {'c_v', 'c_uci', 'c_npmi'} -make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') - -coherence_dict = { - 'u_mass': make_pipeline(segmentation.s_one_pre, - probability_estimation.p_boolean_document, - direct_confirmation_measure.log_conditional_probability, - aggregation.arithmetic_mean), - 'c_v': make_pipeline(segmentation.s_one_set, - probability_estimation.p_boolean_sliding_window, - indirect_confirmation_measure.cosine_similarity, - aggregation.arithmetic_mean), - 'c_uci': make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), - 'c_npmi': make_pipeline(segmentation.s_one_one, +_make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') + +COHERENCE_MEASURES = { + 'u_mass': _make_pipeline(segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean), + 'c_v': _make_pipeline(segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean), + 'c_uci': _make_pipeline(segmentation.s_one_one, probability_estimation.p_boolean_sliding_window, direct_confirmation_measure.log_ratio_measure, aggregation.arithmetic_mean), + 'c_npmi': _make_pipeline(segmentation.s_one_one, + 
probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), } -sliding_windows_dict = { +SLIDING_WINDOW_SIZES = { 'c_v': 110, 'c_uci': 10, 'c_npmi': 10 @@ -174,7 +174,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= elif coherence in sliding_window_based: self.window_size = window_size if self.window_size is None: - self.window_size = sliding_windows_dict[self.coherence] + self.window_size = SLIDING_WINDOW_SIZES[self.coherence] if texts is None: raise ValueError("'texts' should be provided for %s coherence." % coherence) else: @@ -183,8 +183,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= raise ValueError("%s coherence is not currently supported." % coherence) self.topn = topn - self.model = model - + self._model = model self._accumulator = None self._topics = None self.topics = topics @@ -194,9 +193,21 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= def __str__(self): return str(self.measure) + @property + def model(self): + return self._model + + @model.setter + def model(self, model): + self._model = model + if model is not None: + new_topics = self._get_topics() + self._update_accumulator(new_topics) + self._topics = new_topics + @property def measure(self): - return coherence_dict[self.coherence] + return COHERENCE_MEASURES[self.coherence] @property def topics(self): @@ -208,33 +219,34 @@ def topics(self, topics): if self.model is not None: new_topics = self._get_topics() if topics is not None: - logger.warn("Ignoring topics you are attempting to set in favor of model's topics: %s" % self.model) + logger.warning( + "Ignoring topics you are attempting to set in favor of model's topics: %s", + self.model) elif topics is not None: new_topics = [] for topic in topics: topic_token_ids = np.array([self.dictionary.token2id[token] for token in topic]) new_topics.append(topic_token_ids) + self._update_accumulator(new_topics) + self._topics = new_topics + + def _update_accumulator(self, new_topics): if self._relevant_ids_will_differ(new_topics): logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") self._accumulator = None - self._topics = new_topics - def _relevant_ids_will_differ(self, new_topics): - if not self._topics_differ(new_topics): + if self._accumulator is None or not self._topics_differ(new_topics): return False - measure = self.measure - current_set = unique_ids_from_segments(measure.seg(self.topics)) - new_set = unique_ids_from_segments(measure.seg(new_topics)) - return not current_set.issuperset(new_set) + new_set = unique_ids_from_segments(self.measure.seg(new_topics)) + return not self._accumulator.relevant_ids.issuperset(new_set) def _topics_differ(self, new_topics): return (new_topics is not None and self._topics is not None and - self._accumulator is not None and - not np.equal(new_topics, self._topics).all()) + not np.array_equal(new_topics, self._topics)) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 679f115f5b..4827b6ba1e 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -8,33 +8,19 @@ Automated tests for checking transformation algorithms (the models package). 
""" +import os import logging import unittest -import os -import os.path import tempfile +import numpy as np + from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit from gensim.corpora.dictionary import Dictionary - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] +from gensim.matutils import argsort def testfile(): @@ -43,6 +29,23 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): + + # set up vars used in testing ("Deerwester" from the web tutorial) + texts = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] + dictionary = Dictionary(texts) + + @classmethod + def setUpClass(cls): + cls.corpus = [cls.dictionary.doc2bow(text) for text in cls.texts] + def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. 
# `topics1` is clearly better as it has a clear distinction between system-human @@ -52,28 +55,31 @@ def setUp(self): ['graph', 'minors', 'trees', 'eps']] self.topics2 = [['user', 'graph', 'minors', 'system'], ['time', 'graph', 'survey', 'minors']] - self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=0, iterations=0) + self.ldamodel = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=2, + passes=0, iterations=0) mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None if self.mallet_path: - self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus, id2word=dictionary, num_topics=2, iterations=0) + self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, iterations=0) vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) if not vw_path: - msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" - logging.info(msg) + logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified," + " skipping sanity checks for LDA Model") self.vw_path = None else: self.vw_path = vw_path - self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) + self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, passes=0) def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: - kwargs = dict(corpus=corpus, dictionary=dictionary, coherence=coherence) + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence) cm1 = CoherenceModel(topics=self.topics1, **kwargs) cm2 = CoherenceModel(topics=self.topics2, **kwargs) else: - kwargs = dict(texts=texts, dictionary=dictionary, coherence=coherence) + kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence) cm1 = CoherenceModel(topics=self.topics1, **kwargs) cm2 = CoherenceModel(topics=self.topics2, **kwargs) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) @@ -99,127 +105,131 @@ def testUMassLdaModel(self): # Note that this is just a sanity check because LDA does not guarantee a better coherence # value on the topics if iterations are increased. 
This can be seen here: # https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde - try: - cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.ldamodel, corpus=self.corpus, coherence='u_mass') def testCvLdaModel(self): """Perform sanity check to see if c_v coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_v') def testCuciLdaModel(self): """Perform sanity check to see if c_uci coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_uci') def testCnpmiLdaModel(self): """Perform sanity check to see if c_npmi coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_npmi') def testUMassMalletModel(self): """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.malletmodel, corpus=self.corpus, coherence='u_mass') def testCvMalletModel(self): """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_v') def testCuciMalletModel(self): """Perform sanity check to see if c_uci coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_uci') def testCnpmiMalletModel(self): """Perform sanity check to see if c_npmi coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_npmi') def testUMassVWModel(self): """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.vwmodel, corpus=self.corpus, coherence='u_mass') def testCvVWModel(self): """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_v') def testCuciVWModel(self): """Perform sanity check to see if c_uci coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_uci') def testCnpmiVWModel(self): """Perform sanity check to see if c_npmi coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = 
CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_npmi') def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, coherence='u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + coherence='u_mass') # not providing texts for c_v and instead providing corpus - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='c_v') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + dictionary=self.dictionary, coherence='c_v') # not providing corpus or texts for u_mass - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, + dictionary=self.dictionary, coherence='u_mass') def testPersistence(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceCompressed(self): fname = testfile() + '.gz' - model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testAccumulatorCachingSameSizeTopics(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + accumulator = cm1._accumulator + self.assertIsNotNone(accumulator) + cm1.topics = self.topics1 + self.assertEqual(accumulator, cm1._accumulator) + cm1.topics = self.topics2 + self.assertEqual(None, cm1._accumulator) + + def testAccumulatorCachingTopicSubsets(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + accumulator = cm1._accumulator + self.assertIsNotNone(accumulator) + cm1.topics = [t[:2] for t in self.topics1] + self.assertEqual(accumulator, cm1._accumulator) + cm1.topics = self.topics1 + self.assertEqual(accumulator, cm1._accumulator) + + def testAccumulatorCachingWithModelSetting(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + self.assertIsNotNone(cm1._accumulator) + cm1.model = self.ldamodel + topics = [] + for topic in self.ldamodel.state.get_lambda(): + bestn = argsort(topic, topn=cm1.topn, reverse=True) + topics.append(bestn) + self.assertTrue(np.array_equal(topics, cm1.topics)) + self.assertIsNone(cm1._accumulator) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 1ce8a720629e8c920ad27e992f9edc59efa24aae Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 2 Jun 2017 09:24:11 -0400 Subject: [PATCH 27/33] #1342: 
Further tests for persistence of accumulator. --- gensim/test/test_coherencemodel.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 4827b6ba1e..426a6ef71c 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -195,6 +195,26 @@ def testPersistenceCompressed(self): model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testPersistenceAfterProbabilityEstimationUsingCorpus(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') + model.estimate_probabilities() + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertIsNotNone(model2._accumulator) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + + def testPersistenceAfterProbabilityEstimationUsingTexts(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, texts=self.texts, dictionary=self.dictionary, + coherence='c_v') + model.estimate_probabilities() + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertIsNotNone(model2._accumulator) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testAccumulatorCachingSameSizeTopics(self): kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') cm1 = CoherenceModel(topics=self.topics1, **kwargs) From 96fd3433ec124b0be0462e14309dfd27c4b580f1 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Sun, 4 Jun 2017 15:52:08 -0400 Subject: [PATCH 28/33] #1342: Add test case for `CorpusAccumulator`. --- gensim/test/test_text_analysis.py | 30 ++++++++++++++++++++++++- gensim/topic_coherence/text_analysis.py | 5 ++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index ed6d482b44..c32e6b2ebd 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -2,7 +2,8 @@ import unittest from gensim.topic_coherence.text_analysis import \ - InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator + InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \ + CorpusAccumulator from gensim.corpora.dictionary import Dictionary @@ -145,6 +146,33 @@ def init_accumulator2(self): return self.accumulator_cls(2, self.top_ids2, self.dictionary2) +class TestCorpusAnalyzer(unittest.TestCase): + + def setUp(self): + self.dictionary = BaseTestCases.TextAnalyzerTestBase.dictionary + self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids + self.corpus = [self.dictionary.doc2bow(doc) + for doc in BaseTestCases.TextAnalyzerTestBase.texts] + + def test_index_accumulation(self): + accumulator = CorpusAccumulator(self.top_ids)\ + .accumulate(self.corpus) + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 2, 3}, + 15: {0}, + 20: {0}, + 21: {1, 2, 3}, + 17: {1, 2} + } + self.assertDictEqual(expected, inverted_index) + + self.assertEqual(3, accumulator.get_occurrences(10)) + self.assertEqual(2, accumulator.get_occurrences(17)) + self.assertEqual(2, accumulator.get_co_occurrences(10, 21)) + self.assertEqual(1, accumulator.get_co_occurrences(10, 17)) + + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 8cdf1027fd..90d7d83467 
100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -152,9 +152,8 @@ class CorpusAccumulator(InvertedIndexBased): def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for word_id in top_ids_in_doc: - self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) + for word_id in top_ids_in_doc: + self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) def accumulate(self, corpus): for document in corpus: From a631ab69df0b3e553d6b88f8c5ecfa1e92bcb52f Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 5 Jun 2017 11:08:56 -0400 Subject: [PATCH 29/33] #1342: Formatting fixes for hanging indents and overly long lines. --- gensim/corpora/wikicorpus.py | 23 ++++--- gensim/models/coherencemodel.py | 67 +++++++++++-------- gensim/test/test_text_analysis.py | 42 ++++++------ .../direct_confirmation_measure.py | 7 +- .../indirect_confirmation_measure.py | 31 +++++---- .../topic_coherence/probability_estimation.py | 4 +- gensim/topic_coherence/text_analysis.py | 42 +++++++----- 7 files changed, 121 insertions(+), 95 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 13b111db4f..ec032067f1 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -20,13 +20,13 @@ import bz2 import logging -import re -from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution import multiprocessing +import re import signal +from xml.etree.cElementTree import \ + iterparse # LXML isn't faster, so let's go with the built-in solution from gensim import utils - # cannot import whole gensim.corpora, because that imports wikicorpus... from gensim.corpora.dictionary import Dictionary from gensim.corpora.textcorpus import TextCorpus @@ -266,7 +266,8 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ - def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, + filter_namespaces=('0',)): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. 
@@ -305,9 +306,10 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = ((text, self.lemmatize, title, pageid) - for title, text, pageid - in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + texts = \ + ((text, self.lemmatize, title, pageid) + for title, text, pageid + in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) try: @@ -327,9 +329,10 @@ def get_texts(self): else: yield tokens except KeyboardInterrupt: - logger.warn("user terminated iteration over Wikipedia corpus after %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + logger.warn( + "user terminated iteration over Wikipedia corpus after %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) else: logger.info( "finished iterating over Wikipedia corpus of %i documents with %i positions" diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index e53d5600ca..d35a266a4a 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,8 +19,8 @@ """ import logging -from collections import namedtuple import multiprocessing as mp +from collections import namedtuple import numpy as np @@ -41,22 +41,30 @@ _make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') COHERENCE_MEASURES = { - 'u_mass': _make_pipeline(segmentation.s_one_pre, - probability_estimation.p_boolean_document, - direct_confirmation_measure.log_conditional_probability, - aggregation.arithmetic_mean), - 'c_v': _make_pipeline(segmentation.s_one_set, - probability_estimation.p_boolean_sliding_window, - indirect_confirmation_measure.cosine_similarity, - aggregation.arithmetic_mean), - 'c_uci': _make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), - 'c_npmi': _make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), + 'u_mass': _make_pipeline( + segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean + ), + 'c_v': _make_pipeline( + segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean + ), + 'c_uci': _make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean + ), + 'c_npmi': _make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean + ), } SLIDING_WINDOW_SIZES = { @@ -102,8 +110,8 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
""" - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, - coherence='c_v', topn=10, processes=-1): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, + window_size=None, coherence='c_v', topn=10, processes=-1): """ Args: ---- @@ -152,8 +160,9 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= # Check if associated dictionary is provided. if dictionary is None: if isinstance(model.id2word, FakeDict): - raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model" - " should be set as the associated dictionary.") + raise ValueError( + "The associated dictionary should be provided with the corpus or 'id2word'" + " for topic model should be set as the associated dictionary.") else: self.dictionary = model.id2word else: @@ -168,7 +177,9 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.texts = texts self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] else: - raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + raise ValueError( + "Either 'corpus' with 'dictionary' or 'texts' should " + "be provided for %s coherence.", coherence) # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: @@ -176,11 +187,11 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= if self.window_size is None: self.window_size = SLIDING_WINDOW_SIZES[self.coherence] if texts is None: - raise ValueError("'texts' should be provided for %s coherence." % coherence) + raise ValueError("'texts' should be provided for %s coherence.", coherence) else: self.texts = texts else: - raise ValueError("%s coherence is not currently supported." % coherence) + raise ValueError("%s coherence is not currently supported.", coherence) self.topn = topn self._model = model @@ -245,8 +256,8 @@ def _relevant_ids_will_differ(self, new_topics): def _topics_differ(self, new_topics): return (new_topics is not None and - self._topics is not None and - not np.array_equal(new_topics, self._topics)) + self._topics is not None and + not np.array_equal(new_topics, self._topics)) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" @@ -264,8 +275,8 @@ def _get_topics(self): bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) else: - raise ValueError("This topic model is not currently supported. Supported topic models are" - "LdaModel, LdaVowpalWabbit and LdaMallet.") + raise ValueError("This topic model is not currently supported. 
Supported topic models " + " are LdaModel, LdaVowpalWabbit and LdaMallet.") return topics def segment_topics(self): diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index c32e6b2ebd..e6f4aba86b 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,10 +1,10 @@ import logging import unittest +from gensim.corpora.dictionary import Dictionary from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \ CorpusAccumulator -from gensim.corpora.dictionary import Dictionary class BaseTestCases(object): @@ -28,16 +28,18 @@ class TextAnalyzerTestBase(unittest.TestCase): dictionary.id2token = {v: k for k, v in token2id.items()} top_ids = set(token2id.values()) - texts2 = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['user', 'user']] + texts2 = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['user', 'user'] + ] dictionary2 = Dictionary(texts2) dictionary2.id2token = {v: k for k, v in dictionary2.token2id.items()} top_ids2 = set(dictionary2.token2id.values()) @@ -51,8 +53,7 @@ def init_accumulator2(self): return self.accumulator_cls(self.top_ids2, self.dictionary2) def test_occurrence_counting(self): - accumulator = self.init_accumulator()\ - .accumulate(self.texts, 3) + accumulator = self.init_accumulator().accumulate(self.texts, 3) self.assertEqual(3, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) self.assertEqual(1, accumulator.get_occurrences("a")) @@ -62,8 +63,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): - accumulator = self.init_accumulator2()\ - .accumulate(self.texts2, 110) + accumulator = self.init_accumulator2().accumulate(self.texts2, 110) self.assertEqual(2, accumulator.get_occurrences("human")) self.assertEqual(4, accumulator.get_occurrences("user")) self.assertEqual(3, accumulator.get_occurrences("graph")) @@ -90,8 +90,7 @@ def test_occurrence_counting2(self): self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) def test_occurences_for_irrelevant_words(self): - accumulator = self.init_accumulator() \ - .accumulate(self.texts, 2) + accumulator = self.init_accumulator().accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") with self.assertRaises(KeyError): @@ -117,7 +116,7 @@ def test_accumulate1(self): self.assertDictEqual(expected, inverted_index) def test_accumulate2(self): - accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 3) # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document'], # ['test', 'test', 'this'] @@ -151,12 +150,11 @@ class TestCorpusAnalyzer(unittest.TestCase): def setUp(self): self.dictionary = 
BaseTestCases.TextAnalyzerTestBase.dictionary self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids - self.corpus = [self.dictionary.doc2bow(doc) - for doc in BaseTestCases.TextAnalyzerTestBase.texts] + self.corpus = \ + [self.dictionary.doc2bow(doc) for doc in BaseTestCases.TextAnalyzerTestBase.texts] def test_index_accumulation(self): - accumulator = CorpusAccumulator(self.top_ids)\ - .accumulate(self.corpus) + accumulator = CorpusAccumulator(self.top_ids).accumulate(self.corpus) inverted_index = accumulator.index_to_dict() expected = { 10: {0, 2, 3}, diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 60631375ef..29f68ad56e 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -9,6 +9,7 @@ """ import logging + import numpy as np logger = logging.getLogger(__name__) @@ -24,7 +25,8 @@ def log_conditional_probability(segmented_topics, accumulator): Args: ---- - segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. Returns: @@ -62,7 +64,8 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): Args: ---- - segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. Returns: diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index eccfb0a3b5..8321656067 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -5,12 +5,13 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of indirect -confirmation measure is that it computes similarity of words in W' and W* with respect to direct confirmations to all words. -Eg. Suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are -seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” -or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation -measures may capture semantic support that direct measures would miss. +This module contains functions to compute confirmation on a pair of words or word subsets. +The advantage of indirect confirmation measure is that it computes similarity of words in W' and +W* with respect to direct confirmations to all words. Eg. Suppose x and z are both competing +brands of cars, which semantically support each other. However, both brands are seldom mentioned +together in documents in the reference corpus. But their confirmations to other words like “road” +or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. +Thus, indirect confirmation measures may capture semantic support that direct measures would miss. 
The formula used to compute indirect confirmation measure is: @@ -23,11 +24,11 @@ Here 'm' is the direct confirmation measure used. """ -import logging import itertools +import logging -import scipy.sparse as sps import numpy as np +import scipy.sparse as sps from gensim.topic_coherence import direct_confirmation_measure @@ -48,10 +49,13 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm Args: ---- - segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. + accumulator : Output from the probability_estimation module. + Is an accumulator of word occurrences (see text_analysis module). topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + measure : String. Direct confirmation measure to be used. + Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors; default is 1. Returns: @@ -78,13 +82,14 @@ def __init__(self, measure, topics, accumulator, gamma): if measure == 'nlr': self.similarity = _pair_npmi else: - raise ValueError("The direct confirmation measure you entered is not currently supported.") + raise ValueError( + "The direct confirmation measure you entered is not currently supported.") self.mapping = _map_to_contiguous(topics) self.vocab_size = len(self.mapping) self.accumulator = accumulator self.gamma = gamma - self.sim_cache = {} # Cache similarities between tokens represented as pairs of word ids, e.g. (1, 2) + self.sim_cache = {} # Cache similarities between tokens (pairs of word ids), e.g. (1, 2) self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector def __getitem__(self, idx): diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index fb583b99fc..0c62d68985 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -8,8 +8,8 @@ This module contains functions to perform segmentation on a list of topics. """ -import logging import itertools +import logging from gensim.topic_coherence.text_analysis import \ CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator @@ -60,7 +60,7 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p accumulator = WordOccurrenceAccumulator(top_ids, dictionary) else: accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) - logger.info("using %s to estimate probabilities from sliding windows" % accumulator) + logger.info("using %s to estimate probabilities from sliding windows", accumulator) return accumulator.accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 90d7d83467..1b21334178 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -9,10 +9,10 @@ statistical information about word occurrences. 
""" -import sys -import logging import itertools +import logging import multiprocessing as mp +import sys from collections import Counter import numpy as np @@ -65,8 +65,9 @@ def num_docs(self): def num_docs(self, num): self._num_docs = num if self._num_docs % self.log_every == 0: - logger.info("%s accumulated stats from %d documents", - self.__class__.__name__, self._num_docs) + logger.info( + "%s accumulated stats from %d documents", + self.__class__.__name__, self._num_docs) def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") @@ -143,7 +144,7 @@ def _get_co_occurrences(self, word_id1, word_id2): def index_to_dict(self): contiguous2id = {n: word_id for word_id, n in viewitems(self.id2contiguous)} - return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + return {contiguous2id[n]: doc_id_set for n, doc_id_set in enumerate(self._inverted_index)} class CorpusAccumulator(InvertedIndexBased): @@ -177,8 +178,9 @@ def __init__(self, relevant_ids, dictionary): def accumulate(self, texts, window_size): relevant_texts = self._iter_texts(texts) - windows = utils.iter_windows(relevant_texts, window_size, ignore_below_size=False, - include_doc_num=True) + windows = utils.iter_windows( + relevant_texts, window_size, ignore_below_size=False, include_doc_num=True) + for doc_num, virtual_document in windows: self.analyze_text(virtual_document, doc_num) self.num_docs += 1 @@ -307,7 +309,8 @@ def __init__(self, processes, *args, **kwargs): """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: - raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) + raise ValueError( + "Must have at least 2 processes to run in parallel; got %d" % processes) self.processes = processes self.batch_size = kwargs.get('batch_size', 64) @@ -321,8 +324,7 @@ def accumulate(self, texts, window_size): self.queue_all_texts(input_q, texts, window_size) interrupted = False except KeyboardInterrupt: - logger.warn("stats accumulation interrupted; <= %d documents processed", - self._num_docs) + logger.warn("stats accumulation interrupted; <= %d documents processed", self._num_docs) interrupted = True accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) @@ -414,8 +416,9 @@ def merge_accumulators(self, accumulators): # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. 
accumulator._symmetrize() - logger.info("accumulated word occurrence stats for %d virtual documents", - accumulator.num_docs) + logger.info( + "accumulated word occurrence stats for %d virtual documents", + accumulator.num_docs) return accumulator @@ -434,8 +437,9 @@ def run(self): try: self._run() except KeyboardInterrupt: - logger.info("%s interrupted after processing %d documents", - self.__class__.__name__, self.accumulator.num_docs) + logger.info( + "%s interrupted after processing %d documents", + self.__class__.__name__, self.accumulator.num_docs) except: logger.exception("worker encountered unexpected exception") finally: @@ -453,11 +457,13 @@ def _run(self): self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) - logger.debug("completed batch %d; %d documents processed (%d virtual)", - batch_num, n_docs, self.accumulator.num_docs) + logger.debug( + "completed batch %d; %d documents processed (%d virtual)", + batch_num, n_docs, self.accumulator.num_docs) - logger.debug("finished all batches; %d documents processed (%d virtual)", - n_docs, self.accumulator.num_docs) + logger.debug( + "finished all batches; %d documents processed (%d virtual)", + n_docs, self.accumulator.num_docs) def reply_to_master(self): logger.info("serializing accumulator to return to master...") From 5f58bdae633a003a1157655b275672dcde1a61f0 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 6 Jun 2017 14:04:21 -0400 Subject: [PATCH 30/33] #1342: Fix `indirect_confirmation_measure.cosine_similarity` to return individual topic coherence values, then average those. Make the `ParallelWordOccurrenceAccumulator` return a `WordOccurrenceAccumulator` after accumulation, so it can be trained further afterwards if desired. --- gensim/test/test_coherencemodel.py | 15 +++++++-------- gensim/test/test_indirect_confirmation.py | 11 +++++------ .../indirect_confirmation_measure.py | 6 ++++-- gensim/topic_coherence/text_analysis.py | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 426a6ef71c..d055523dff 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -8,19 +8,19 @@ Automated tests for checking transformation algorithms (the models package). 
""" -import os import logging -import unittest +import os import tempfile +import unittest import numpy as np +from gensim.corpora.dictionary import Dictionary +from gensim.matutils import argsort from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit -from gensim.corpora.dictionary import Dictionary -from gensim.matutils import argsort def testfile(): @@ -76,12 +76,11 @@ def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence) - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm2 = CoherenceModel(topics=self.topics2, **kwargs) else: kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence) - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm2 = CoherenceModel(topics=self.topics2, **kwargs) + + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) def testUMass(self): diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index aedd9eaa9a..e78d32ac58 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -11,11 +11,11 @@ import logging import unittest +import numpy as np + +from gensim.corpora.dictionary import Dictionary from gensim.topic_coherence import indirect_confirmation_measure from gensim.topic_coherence import text_analysis -from gensim.corpora.dictionary import Dictionary - -import numpy as np class TestIndirectConfirmation(unittest.TestCase): @@ -46,9 +46,8 @@ def testCosineSimilarity(self): # 4. Calculate nlr(1, 1) + nlr(2, 1). Calculate nlr(1, 2), nlr(2, 2). This is our second vector. # 5. Find out cosine similarity between these two vectors. # 6. Similarly for the second segmentation. - expected = [0.6230, 0.6230] # To account for EPSILON approximation - for i in range(len(expected)): - self.assertAlmostEqual(obtained[i], expected[i], 4) + expected = (0.6230 + 0.6230) / 2. 
# To account for EPSILON approximation + self.assertAlmostEqual(expected, obtained[0], 4) if __name__ == '__main__': diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8321656067..a05676ab61 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -67,10 +67,12 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm s_cos_sim = [] for topic_words, topic_segments in zip(topics, segmented_topics): topic_words = tuple(topic_words) # because tuples are hashable - for w_prime, w_star in topic_segments: + segment_sims = np.zeros(len(topic_segments)) + for i, (w_prime, w_star) in enumerate(topic_segments): w_prime_cv = context_vectors[w_prime, topic_words] w_star_cv = context_vectors[w_star, topic_words] - s_cos_sim.append(_cossim(w_prime_cv, w_star_cv)) + segment_sims[i] = _cossim(w_prime_cv, w_star_cv) + s_cos_sim.append(np.mean(segment_sims)) return s_cos_sim diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 1b21334178..a44e57fb3e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -410,8 +410,8 @@ def merge_accumulators(self, accumulators): occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed by all the individual accumulators. """ - accumulator = accumulators[0] - for other_accumulator in accumulators[1:]: + accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + for other_accumulator in accumulators: accumulator.merge(other_accumulator) # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. From b941f3c25374a8ad9e200567f868e6fe1f06ce4d Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 7 Jun 2017 10:15:47 -0400 Subject: [PATCH 31/33] #1342: Fix `direct_confirmation_measure` functions to return individual topic coherence values, then average those. 
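
Illustrative sketch (not from the patch; the scores below are hypothetical): each topic's segment-level measures are now averaged into one value per topic, and the aggregation stage then takes the arithmetic mean over those per-topic values.

    import numpy as np

    # Hypothetical segment-level confirmation scores for two topics.
    segment_scores = [[-0.69, -0.41, -0.22], [-1.10, -0.35]]
    per_topic = [np.mean(scores) for scores in segment_scores]  # [-0.44, -0.725]
    overall = np.mean(per_topic)  # what aggregation.arithmetic_mean then returns
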
--- gensim/test/test_direct_confirmation.py | 9 ++++++--- .../topic_coherence/direct_confirmation_measure.py | 13 +++++++++---- .../indirect_confirmation_measure.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index ad39b99b62..1d4f701cc9 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -33,21 +33,24 @@ def setUp(self): def testLogConditionalProbability(self): """Test log_conditional_probability()""" - obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.accumulator)[0] + obtained = direct_confirmation_measure.log_conditional_probability( + self.segmentation, self.accumulator)[0] # Answer should be ~ ln(1 / 2) = -0.693147181 expected = -0.693147181 self.assertAlmostEqual(obtained, expected) def testLogRatioMeasure(self): """Test log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator)[0] + obtained = direct_confirmation_measure.log_ratio_measure( + self.segmentation, self.accumulator)[0] # Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 expected = -0.182321557 self.assertAlmostEqual(obtained, expected) def testNormalizedLogRatioMeasure(self): """Test normalized_log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator, normalize=True)[0] + obtained = direct_confirmation_measure.log_ratio_measure( + self.segmentation, self.accumulator, normalize=True)[0] # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753 expected = -0.113282753 self.assertAlmostEqual(obtained, expected) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 29f68ad56e..467d134f29 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -31,11 +31,12 @@ def log_conditional_probability(segmented_topics, accumulator): Returns: ------- - m_lc : List of log conditional probability measure on each set in segmented topics. + m_lc : List of log conditional probability measure for each topic. """ m_lc = [] num_docs = float(accumulator.num_docs) for s_i in segmented_topics: + segment_sims = [] for w_prime, w_star in s_i: try: w_star_count = accumulator[w_star] @@ -44,7 +45,8 @@ def log_conditional_probability(segmented_topics, accumulator): except KeyError: m_lc_i = 0.0 - m_lc.append(m_lc_i) + segment_sims.append(m_lc_i) + m_lc.append(np.mean(segment_sims)) return m_lc @@ -70,11 +72,12 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): Returns: ------- - m_lr : List of log ratio measures on each set in segmented topics. + m_lr : List of log ratio measures for each topic. 
""" m_lr = [] num_docs = float(accumulator.num_docs) for s_i in segmented_topics: + segment_sims = [] for w_prime, w_star in s_i: w_prime_count = accumulator[w_prime] w_star_count = accumulator[w_star] @@ -90,6 +93,8 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): numerator = (co_occur_count / num_docs) + EPSILON denominator = (w_prime_count / num_docs) * (w_star_count / num_docs) m_lr_i = np.log(numerator / denominator) - m_lr.append(m_lr_i) + + segment_sims.append(m_lr_i) + m_lr.append(np.mean(segment_sims)) return m_lr diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index a05676ab61..07f221e941 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -60,7 +60,7 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm Returns: ------- - s_cos_sim : array of cosine similarity of the context vectors for each segmentation + s_cos_sim : list of indirect cosine similarity measure for each topic. """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) From 75fcac8c90ce17d5b315f1e422becc8cf8b64764 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 8 Jun 2017 13:15:28 -0400 Subject: [PATCH 32/33] #1342: Hanging indents and switch out `union` with `update` for unique ids from topic segments. --- gensim/test/test_coherencemodel.py | 73 +++++++++++-------- gensim/test/test_probability_estimation.py | 18 +++-- .../topic_coherence/probability_estimation.py | 2 +- 3 files changed, 53 insertions(+), 40 deletions(-) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index d055523dff..039db55a48 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -31,15 +31,17 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): # set up vars used in testing ("Deerwester" from the web tutorial) - texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] + ] dictionary = Dictionary(texts) @classmethod @@ -55,22 +57,28 @@ def setUp(self): ['graph', 'minors', 'trees', 'eps']] self.topics2 = [['user', 'graph', 'minors', 'system'], ['time', 'graph', 'survey', 'minors']] - self.ldamodel = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=2, - passes=0, iterations=0) + self.ldamodel = LdaModel( + corpus=self.corpus, id2word=self.dictionary, num_topics=2, + passes=0, iterations=0) + mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None if self.mallet_path: - self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, iterations=0) + self.malletmodel = LdaMallet( + mallet_path=self.mallet_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, iterations=0) + 
vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) if not vw_path: - logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified," - " skipping sanity checks for LDA Model") + logging.info( + "Environment variable 'VOWPAL_WABBIT_PATH' not specified," + " skipping sanity checks for LDA Model") self.vw_path = None else: self.vw_path = vw_path - self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, passes=0) + self.vwmodel = LdaVowpalWabbit( + self.vw_path, corpus=self.corpus, id2word=self.dictionary, + num_topics=2, passes=0) def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" @@ -169,35 +177,38 @@ def testCnpmiVWModel(self): def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - coherence='u_mass') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + coherence='u_mass') # not providing texts for c_v and instead providing corpus - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - dictionary=self.dictionary, coherence='c_v') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + dictionary=self.dictionary, coherence='c_v') # not providing corpus or texts for u_mass - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, - dictionary=self.dictionary, coherence='u_mass') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, dictionary=self.dictionary, + coherence='u_mass') def testPersistence(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceCompressed(self): fname = testfile() + '.gz' - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceAfterProbabilityEstimationUsingCorpus(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) @@ -206,8 +217,8 @@ def testPersistenceAfterProbabilityEstimationUsingCorpus(self): def testPersistenceAfterProbabilityEstimationUsingTexts(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, texts=self.texts, dictionary=self.dictionary, - coherence='c_v') + model = CoherenceModel( + topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v') model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 982230a526..1e674415f3 100644 --- 
a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -11,20 +11,22 @@ import logging import unittest -from gensim.topic_coherence import probability_estimation -from gensim.corpora.hashdictionary import HashDictionary from gensim.corpora.dictionary import Dictionary +from gensim.corpora.hashdictionary import HashDictionary +from gensim.topic_coherence import probability_estimation class BaseTestCases(object): class ProbabilityEstimationBase(unittest.TestCase): - texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] + texts = [ + ['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'] + ] dictionary = None def build_segmented_topics(self): diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 0c62d68985..552fe5c4d7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -70,7 +70,7 @@ def unique_ids_from_segments(segmented_topics): for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): if hasattr(word_id, '__iter__'): - top_ids = top_ids.union(word_id) + top_ids.update(word_id) else: top_ids.add(word_id) From 96d1349691b3729f2ae66f4e71c818a5cc1169db Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 9 Jun 2017 09:15:38 -0400 Subject: [PATCH 33/33] #1342: Clarify documentation in the `probability_estimation` module. --- .../topic_coherence/probability_estimation.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 552fe5c4d7..85e787de18 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -18,9 +18,9 @@ def p_boolean_document(corpus, segmented_topics): - """ - This function performs the boolean document probability estimation. Boolean document estimates the probability - of a single word as the number of documents in which the word occurs divided by the total number of documents. + """This function performs the boolean document probability estimation. + Boolean document estimates the probability of a single word as the number + of documents in which the word occurs divided by the total number of documents. Args: ---- @@ -29,19 +29,19 @@ def p_boolean_document(corpus, segmented_topics): Returns: ------- - per_topic_postings : Boolean document posting list for each unique topic id. - num_docs : Total number of documents in corpus. + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): - """ - This function performs the boolean sliding window probability estimation. Boolean sliding window - determines word counts using a sliding window. The window moves over the documents one word token per step. - Each step defines a new virtual document by copying the window content. 
Boolean document is applied to - these virtual documents to compute word probabilities. + """This function performs the boolean sliding window probability estimation. + Boolean sliding window determines word counts using a sliding window. The window + moves over the documents one word token per step. Each step defines a new virtual + document by copying the window content. Boolean document is applied to these virtual + documents to compute word probabilities. Args: ---- @@ -52,8 +52,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Returns: ------- - per_topic_postings : Boolean sliding window postings list of all the unique topic ids. - window_id[0] : Total no of windows + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -65,13 +65,21 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def unique_ids_from_segments(segmented_topics): - """Return the set of all unique ids in a list of segmented topics.""" - top_ids = set() # is a set of all the unique ids contained in topics. + """Return the set of all unique ids in a list of segmented topics. + + Args: + ---- + segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set + is either a single integer, or a `numpy.ndarray` of integers. + Returns: + unique_ids : set of unique ids across all topic segments. + """ + unique_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): if hasattr(word_id, '__iter__'): - top_ids.update(word_id) + unique_ids.update(word_id) else: - top_ids.add(word_id) + unique_ids.add(word_id) - return top_ids + return unique_ids
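
Note (appended after the series, not part of any patch): the direct confirmation refactor above now averages the per-segment values inside each topic, so each entry of `m_lc` / `m_lr` is a per-topic score. The expected numbers quoted in the `test_direct_confirmation.py` comments can be reproduced with a minimal standalone sketch; the toy counts below (num_docs, w_prime_count, w_star_count, co_occur_count) and the EPSILON value are assumptions chosen to match the ratios in those comments, not values copied from the test fixture.

    import math

    # Toy counts consistent with the ratios quoted in the test comments
    # (assumed for illustration; the real counts come from the accumulator).
    num_docs = 5.0
    w_prime_count, w_star_count, co_occur_count = 3.0, 2.0, 1.0
    EPSILON = 1e-12  # placeholder; the module defines its own constant

    # log conditional probability: ln(P(w', w*) / P(w*)) ~ ln(1/2)
    m_lc = math.log((co_occur_count / num_docs + EPSILON) / (w_star_count / num_docs))

    # log ratio measure (PMI): ln((1/5 + eps) / ((3/5) * (2/5)))
    numerator = co_occur_count / num_docs + EPSILON
    denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
    m_lr = math.log(numerator / denominator)

    # normalized log ratio measure (NPMI): PMI / -ln(P(w', w*) + eps)
    m_nlr = m_lr / -math.log(numerator)

    print(m_lc, m_lr, m_nlr)
    # roughly -0.69314718, -0.18232156, -0.11328275, matching the expected
    # values -0.693147181, -0.182321557 and -0.113282753 in the tests.

The sketch only mirrors the arithmetic visible in the hunks; the patched functions read these counts from the accumulator and then take `np.mean` over each topic's segment values.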
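
A similar sketch of the id collection that patch 32 switches from `union` to `update`: `set.update` mutates the set in place instead of allocating and discarding a new set for every segment, and it handles segment members that are either plain ints or iterables such as numpy arrays (as the docstring added in patch 33 describes). The sample ids here are made up for illustration.

    import itertools
    import numpy as np

    # Made-up segmented topics: each pair member is either an int id
    # or a numpy array of ids.
    segmented_topics = [
        [(1, 2), (3, np.array([1, 2]))],
        [(2, 4), (np.array([3, 4]), 1)],
    ]

    unique_ids = set()
    for s_i in segmented_topics:
        for word_id in itertools.chain.from_iterable(s_i):
            if hasattr(word_id, '__iter__'):
                unique_ids.update(word_id)  # in place; no throwaway set as with union()
            else:
                unique_ids.add(word_id)

    print(sorted(int(i) for i in unique_ids))  # [1, 2, 3, 4]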