From 96fd3433ec124b0be0462e14309dfd27c4b580f1 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Sun, 4 Jun 2017 15:52:08 -0400 Subject: [PATCH] #1342: Add test case for `CorpusAccumulator`. --- gensim/test/test_text_analysis.py | 30 ++++++++++++++++++++++++- gensim/topic_coherence/text_analysis.py | 5 ++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index ed6d482b44..c32e6b2ebd 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -2,7 +2,8 @@ import unittest from gensim.topic_coherence.text_analysis import \ - InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator + InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \ + CorpusAccumulator from gensim.corpora.dictionary import Dictionary @@ -145,6 +146,33 @@ def init_accumulator2(self): return self.accumulator_cls(2, self.top_ids2, self.dictionary2) +class TestCorpusAnalyzer(unittest.TestCase): + + def setUp(self): + self.dictionary = BaseTestCases.TextAnalyzerTestBase.dictionary + self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids + self.corpus = [self.dictionary.doc2bow(doc) + for doc in BaseTestCases.TextAnalyzerTestBase.texts] + + def test_index_accumulation(self): + accumulator = CorpusAccumulator(self.top_ids)\ + .accumulate(self.corpus) + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 2, 3}, + 15: {0}, + 20: {0}, + 21: {1, 2, 3}, + 17: {1, 2} + } + self.assertDictEqual(expected, inverted_index) + + self.assertEqual(3, accumulator.get_occurrences(10)) + self.assertEqual(2, accumulator.get_occurrences(17)) + self.assertEqual(2, accumulator.get_co_occurrences(10, 21)) + self.assertEqual(1, accumulator.get_co_occurrences(10, 17)) + + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 8cdf1027fd..90d7d83467 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -152,9 +152,8 @@ class CorpusAccumulator(InvertedIndexBased): def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for word_id in top_ids_in_doc: - self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) + for word_id in top_ids_in_doc: + self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) def accumulate(self, corpus): for document in corpus: