Skip to content

Commit

Permalink
piskvorky#1342: Add test case for CorpusAccumulator.
Browse files Browse the repository at this point in the history
  • Loading branch information
Sweeney, Mack committed Jun 4, 2017
1 parent 1ce8a72 commit 96fd343
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 4 deletions.
30 changes: 29 additions & 1 deletion gensim/test/test_text_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import unittest

from gensim.topic_coherence.text_analysis import \
InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator
InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \
CorpusAccumulator
from gensim.corpora.dictionary import Dictionary


Expand Down Expand Up @@ -145,6 +146,33 @@ def init_accumulator2(self):
return self.accumulator_cls(2, self.top_ids2, self.dictionary2)


class TestCorpusAnalyzer(unittest.TestCase):
    """Exercise CorpusAccumulator using the fixtures shared by the text analyzer tests."""

    def setUp(self):
        """Borrow the dictionary, top ids and texts from TextAnalyzerTestBase."""
        base = BaseTestCases.TextAnalyzerTestBase
        self.dictionary = base.dictionary
        self.top_ids = base.top_ids
        # The accumulator is fed bag-of-words documents rather than raw token lists.
        self.corpus = list(map(self.dictionary.doc2bow, base.texts))

    def test_index_accumulation(self):
        """Accumulate the corpus, then verify the index and the derived counts."""
        acc = CorpusAccumulator(self.top_ids).accumulate(self.corpus)
        actual_index = acc.index_to_dict()

        # Expected mapping of each relevant id to the set recorded for it
        # (presumably document numbers -- confirm against the fixture texts).
        expected = {
            10: {0, 2, 3},
            15: {0},
            20: {0},
            21: {1, 2, 3},
            17: {1, 2},
        }
        self.assertDictEqual(expected, actual_index)

        # Single-id occurrence counts.
        self.assertEqual(3, acc.get_occurrences(10))
        self.assertEqual(2, acc.get_occurrences(17))

        # Pairwise co-occurrence counts.
        self.assertEqual(2, acc.get_co_occurrences(10, 21))
        self.assertEqual(1, acc.get_co_occurrences(10, 17))


if __name__ == '__main__':
    # Keep the root logger at WARNING so the test output stays readable.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.WARNING)
    unittest.main()
5 changes: 2 additions & 3 deletions gensim/topic_coherence/text_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,8 @@ class CorpusAccumulator(InvertedIndexBased):
def analyze_text(self, text, doc_num=None):
    """Record the current document number for each relevant id found in `text`.

    Parameters
    ----------
    text : iterable
        Items whose first element (``item[0]``) is a word id; the tests feed
        bag-of-words documents here (presumably ``(word_id, count)`` pairs).
    doc_num : optional
        Not used by this implementation; kept for interface compatibility.

    For every id that is both in ``self.relevant_ids`` and in the document,
    ``self._num_docs`` is added to that id's entry in ``self._inverted_index``
    (indexed through ``self.id2contiguous``).
    """
    doc_words = frozenset(x[0] for x in text)
    top_ids_in_doc = self.relevant_ids.intersection(doc_words)
    # No emptiness guard needed: iterating an empty intersection is a no-op.
    for word_id in top_ids_in_doc:
        self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs)

def accumulate(self, corpus):
for document in corpus:
Expand Down

0 comments on commit 96fd343

Please sign in to comment.