piskvorky#1342: Add test case for CorpusAccumulator.

macks22 · Jun 4, 2017 · 96fd343
1 parent 1ce8a72
commit 96fd343
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 4 deletions.
diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py
@@ -2,7 +2,8 @@
 import unittest
 
 from gensim.topic_coherence.text_analysis import \
-    InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator
+    InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \
+    CorpusAccumulator
 from gensim.corpora.dictionary import Dictionary
 
 
@@ -145,6 +146,33 @@ def init_accumulator2(self):
         return self.accumulator_cls(2, self.top_ids2, self.dictionary2)
 
 
+class TestCorpusAnalyzer(unittest.TestCase):
+
+    def setUp(self):
+        self.dictionary = BaseTestCases.TextAnalyzerTestBase.dictionary
+        self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids
+        self.corpus = [self.dictionary.doc2bow(doc)
+                       for doc in BaseTestCases.TextAnalyzerTestBase.texts]
+
+    def test_index_accumulation(self):
+        accumulator = CorpusAccumulator(self.top_ids)\
+            .accumulate(self.corpus)
+        inverted_index = accumulator.index_to_dict()
+        expected = {
+            10: {0, 2, 3},
+            15: {0},
+            20: {0},
+            21: {1, 2, 3},
+            17: {1, 2}
+        }
+        self.assertDictEqual(expected, inverted_index)
+
+        self.assertEqual(3, accumulator.get_occurrences(10))
+        self.assertEqual(2, accumulator.get_occurrences(17))
+        self.assertEqual(2, accumulator.get_co_occurrences(10, 21))
+        self.assertEqual(1, accumulator.get_co_occurrences(10, 17))
+
+
 if __name__ == '__main__':
     logging.root.setLevel(logging.WARNING)
     unittest.main()
diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
@@ -152,9 +152,8 @@ class CorpusAccumulator(InvertedIndexBased):
     def analyze_text(self, text, doc_num=None):
         doc_words = frozenset(x[0] for x in text)
         top_ids_in_doc = self.relevant_ids.intersection(doc_words)
-        if len(top_ids_in_doc) > 0:
-            for word_id in top_ids_in_doc:
-                self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs)
+        for word_id in top_ids_in_doc:
+            self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs)
 
     def accumulate(self, corpus):
         for document in corpus: