From 9d06a1fbc059656923e7213c29fab592195d6c10 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 14:43:07 -0400 Subject: [PATCH 01/33] #1342: Allow use of truncated `Dictionary` for coherence calculation by avoiding lookup of tokens not in the topic token lists. --- gensim/models/coherencemodel.py | 49 ++++--- gensim/test/test_probability_estimation.py | 87 ++++++++++-- .../topic_coherence/probability_estimation.py | 125 ++++++++++++------ 3 files changed, 191 insertions(+), 70 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 161d0257a4..130c285822 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,6 +19,7 @@ """ import logging +import multiprocessing as mp from gensim import interfaces from gensim.topic_coherence import (segmentation, probability_estimation, @@ -89,7 +90,8 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, + coherence='c_v', topn=10): """ Args: ---- @@ -128,8 +130,10 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= raise ValueError("One of model or topics has to be provided.") elif topics is not None and dictionary is None: raise ValueError("dictionary has to be provided if topics are to be used.") + if texts is None and corpus is None: raise ValueError("One of texts or corpus has to be provided.") + # Check if associated dictionary is provided. if dictionary is None: if isinstance(model.id2word, FakeDict): @@ -139,6 +143,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.dictionary = model.id2word else: self.dictionary = dictionary + # Check for correct inputs for u_mass coherence measure. if coherence in boolean_document_based: if is_corpus(corpus)[0]: @@ -148,6 +153,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] else: raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: self.window_size = window_size @@ -157,6 +163,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.texts = texts else: raise ValueError("%s coherence is not currently supported." % coherence) + self.topn = topn self.model = model if model is not None: @@ -193,27 +200,29 @@ def _get_topics(self): "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics - def get_coherence(self): - """ - Return coherence value based on pipeline parameters. 
- """ + def get_coherence_per_topic(self): measure = coherence_dict[self.coherence] segmented_topics = measure.seg(self.topics) + if self.coherence in boolean_document_based: per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs) - elif self.coherence in sliding_window_based: - if self.window_size is not None: - self.window_size = sliding_windows_dict[self.coherence] - per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) - if self.coherence == 'c_v': - confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) - else: - if self.coherence == 'c_npmi': - normalize = True - else: - # For c_uci - normalize = False - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + return measure.conf(segmented_topics, per_topic_postings, num_docs) + + if self.window_size is not None: + self.window_size = sliding_windows_dict[self.coherence] + per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) + if self.coherence == 'c_v': + return measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + else: + normalize = self.coherence == 'c_npmi' + return measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + + def aggregate_measures(self, confirmed_measures): + measure = coherence_dict[self.coherence] return measure.aggr(confirmed_measures) + + def get_coherence(self): + """Return coherence value based on pipeline parameters.""" + confirmed_measures = self.get_coherence_per_topic() + return self.aggregate_measures(confirmed_measures) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 596f91f65b..09d9ee071f 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -13,15 +13,20 @@ from gensim.topic_coherence import probability_estimation from gensim.corpora.hashdictionary import HashDictionary +from gensim.corpora.dictionary import Dictionary -class TestProbabilityEstimation(unittest.TestCase): + +class ProbabilityEstimationBase(unittest.TestCase): + texts = [['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees']] + + +class TestProbabilityEstimation(ProbabilityEstimationBase): def setUp(self): - self.texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] self.dictionary = HashDictionary(self.texts) # Following is the mapping: # {'computer': 10608, @@ -36,21 +41,77 @@ def setUp(self): # 'user': 12736} self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [[(5798, 18451), (10608, 18451), (10608, 5798)], [(10608, 18451), (12736, 18451), (12736, 10608)]] + self.segmented_topics = [ + [ + (5798, 18451), + (10608, 18451), + (10608, 5798) + ], [ + (10608, 18451), + (12736, 18451), + (12736, 10608) + ] + ] def testPBooleanDocument(self): """Test p_boolean_document()""" # Unique topic ids are 
5798, 10608, 12736 and 18451 obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - expected = {18451: set([5]), 12736: set([1, 3]), 5798: set([1, 2]), 10608: set([0])} - self.assertTrue(obtained == expected) + expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} + self.assertEqual(expected, obtained) + + def testPBooleanSlidingWindow(self): + """Test p_boolean_sliding_window()""" + # Test with window size as 2. window_id is zero indexed. + obtained, _ = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}} + self.assertEqual(expected, obtained) + + +class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): + def setUp(self): + self.dictionary = Dictionary(self.texts) + self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()} + # Following is the mapping: + # {u'computer': 1, + # u'eps': 5, + # u'graph': 9, + # u'human': 2, + # u'interface': 0, + # u'response': 6, + # u'system': 4, + # u'time': 7, + # u'trees': 8, + # u'user': 3} + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + # Suppose the segmented topics from s_one_pre are: + self.segmented_topics = [ + [ + (4, 9), + (1, 9), + (1, 4) + ], [ + (1, 9), + (3, 9), + (3, 1) + ] + ] + + def testPBooleanDocument(self): + """Test p_boolean_document()""" + obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} + self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. - obtained, _ = probability_estimation.p_boolean_sliding_window(self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: set([1]), 12736: set([8, 2, 3]), 18451: set([11]), 5798: set([4, 5, 6, 7])} - self.assertTrue(obtained == expected) + obtained, _ = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + expected = {1: {1}, 3: {8, 2, 3}, 9: {11}, 4: {4, 5, 6, 7}} + self.assertEqual(expected, obtained) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 8922c511a3..c7e3c4d3d2 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -9,28 +9,45 @@ """ import logging -import numpy as np - -from gensim.corpora import Dictionary - from itertools import chain, islice +from collections import defaultdict + +import numpy as np logger = logging.getLogger(__name__) + def _ret_top_ids(segmented_topics): """ Helper function to return a set of all the unique topic ids in segmented topics. """ top_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: - for id in chain.from_iterable(s_i): - if isinstance(id, np.ndarray): - for i in id: + for word_id in chain.from_iterable(s_i): + if isinstance(word_id, np.ndarray): + for i in word_id: top_ids.add(i) else: - top_ids.add(id) + top_ids.add(word_id) + return top_ids + +def _ids_to_words(ids, dictionary): + """Convert an iterable of ids to their corresponding words using a dictionary. + This function abstracts away the differences between the HashDictionary and the standard one. 
+ """ + top_words = set() + for word_id in ids: + word = dictionary[word_id] + if isinstance(word, set): + top_words = top_words.union(word) + else: + top_words.add(word) + + return top_words + + def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -48,18 +65,65 @@ def p_boolean_document(corpus, segmented_topics): """ top_ids = _ret_top_ids(segmented_topics) # Instantiate the dictionary with empty sets for each top_id - per_topic_postings = {} - for id in top_ids: - per_topic_postings[id] = set() + per_topic_postings = {word_id: set() for word_id in top_ids} + # Iterate through the documents, appending the document number to the set for each top_id it contains for n, document in enumerate(corpus): doc_words = frozenset(x[0] for x in document) top_ids_in_doc = top_ids.intersection(doc_words) if len(top_ids_in_doc) > 0: - for id in top_ids_in_doc: - per_topic_postings[id].add(n) - num_docs = len(corpus) - return (per_topic_postings, num_docs) + for word_id in top_ids_in_doc: + per_topic_postings[word_id].add(n) + + return per_topic_postings, len(corpus) + + +def _iter_windows(texts, window_size): + """Produce a generator over the given texts using a sliding window of `window_size`. + + Args: + ---- + texts: List of string sentences. + window_size: Size of sliding window. + + """ + for document in texts: + it = iter(document) + window = tuple(islice(it, window_size)) + yield window + + for elem in it: + window = window[1:] + (elem,) + yield window + + +class WordOccurrenceAccumulator(object): + """Accumulate word occurrences from a sequence of documents.""" + + def __init__(self, relevant_words): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. + """ + self.relevant_words = set(relevant_words) + self.window_id = 0 # id of next document to be observed + self.word_occurrences = defaultdict(set) # map from words to ids of docs they occur in + + def filter_to_relevant_words(self, doc): + return (word for word in doc if word in self.relevant_words) + + def add_occurrences_from_doc(self, window): + for word in self.filter_to_relevant_words(window): + self.word_occurrences[word].add(self.window_id) + + self.window_id += 1 + + def accumulate(self, texts, window_size): + for virtual_document in _iter_windows(texts, window_size): + self.add_occurrences_from_doc(virtual_document) + return self + def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ @@ -81,26 +145,13 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - window_id = 0 # Each window assigned a window id. - per_topic_postings = {} - token2id_dict = dictionary.token2id - def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): - for word in window: - word_id = token2id_dict[word] - if word_id in top_ids: - if word_id in per_topic_postings: - per_topic_postings[word_id].add(window_id) - else: - per_topic_postings[word_id] = set([window_id]) - window_id += 1 - return (window_id, per_topic_postings) - # Apply boolean sliding window to each document in texts. 
- for document in texts: - it = iter(document) - window = tuple(islice(it, window_size)) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) - for elem in it: - window = window[1:] + (elem,) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) + top_words = _ids_to_words(top_ids, dictionary) + occurrence_accumulator = WordOccurrenceAccumulator(top_words)\ + .accumulate(texts, window_size) + + # Replace words with their ids. + occurrences = occurrence_accumulator.word_occurrences + per_topic_postings = {dictionary.token2id[word]: id_set + for word, id_set in occurrences.iteritems()} - return per_topic_postings, window_id + return per_topic_postings, occurrence_accumulator.window_id From f69a2ffa7fe2b9254c61393d057201fa4a331ed7 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 16:14:19 -0400 Subject: [PATCH 02/33] #1342: Do not produce sliding windows for texts with no relevant words, and ensure each relevant word has a set in the `per_topic_postings` dict. --- gensim/test/test_probability_estimation.py | 4 ++-- gensim/topic_coherence/probability_estimation.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 09d9ee071f..68ac24e752 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -65,7 +65,7 @@ def testPBooleanSlidingWindow(self): # Test with window size as 2. window_id is zero indexed. obtained, _ = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}} + expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {10}, 5798: {4, 5, 6, 7}} self.assertEqual(expected, obtained) @@ -109,7 +109,7 @@ def testPBooleanSlidingWindow(self): # Test with window size as 2. window_id is zero indexed. obtained, _ = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {1: {1}, 3: {8, 2, 3}, 9: {11}, 4: {4, 5, 6, 7}} + expected = {1: {1}, 3: {8, 2, 3}, 9: {10}, 4: {4, 5, 6, 7}} self.assertEqual(expected, obtained) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index c7e3c4d3d2..ff9c0708bc 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -119,8 +119,15 @@ def add_occurrences_from_doc(self, window): self.window_id += 1 + def text_is_relevant(self, text): + for word in text: + if word in self.relevant_words: + return True + return False + def accumulate(self, texts, window_size): - for virtual_document in _iter_windows(texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in _iter_windows(relevant_texts, window_size): self.add_occurrences_from_doc(virtual_document) return self @@ -154,4 +161,9 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): per_topic_postings = {dictionary.token2id[word]: id_set for word, id_set in occurrences.iteritems()} + # Ensure all top ids have a corresponding set, even if it's an empty one. 
+ for word_id in top_ids: + if word_id not in per_topic_postings: + per_topic_postings[word_id] = set() + return per_topic_postings, occurrence_accumulator.window_id From 26de54726b0790be73e2fe70614c13f9a2334f0e Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 22 May 2017 16:23:16 -0400 Subject: [PATCH 03/33] #1342: Remove unused multiprocessing import in `coherencemodel` module. --- gensim/models/coherencemodel.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 130c285822..4e110681e2 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,20 +19,18 @@ """ import logging -import multiprocessing as mp +from collections import namedtuple + +import numpy as np from gensim import interfaces +from gensim.matutils import argsort +from gensim.models.ldamodel import LdaModel +from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) -from gensim.matutils import argsort from gensim.utils import is_corpus, FakeDict -from gensim.models.ldamodel import LdaModel -from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet - -import numpy as np - -from collections import namedtuple logger = logging.getLogger(__name__) From dfe159b17dd4e8a79e3cad96d53b4f54079452b4 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 11:07:24 -0400 Subject: [PATCH 04/33] add utility functions for strided windowing of texts (lists of strings representation of corpus) --- gensim/test/test_utils.py | 68 +++++++++++++++++++++++++++++++++++++-- gensim/utils.py | 50 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 41f20eb232..cbdac0170b 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -13,6 +13,8 @@ from gensim import utils from six import iteritems +import numpy as np + class TestIsCorpus(unittest.TestCase): def test_None(self): @@ -90,8 +92,70 @@ def test_sample_dict(self): self.assertEqual(sampled_dict,expected_dict) sampled_dict_random = utils.sample_dict(d,2) if sampled_dict_random in expected_dict_random: - self.assertTrue(True) - + self.assertTrue(True) + + +class TestWindowing(unittest.TestCase): + + arr10_5 = np.array([ + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + [4, 5, 6, 7, 8], + [5, 6, 7, 8, 9] + ]) + + def _assert_arrays_equal(self, expected, actual): + self.assertEqual(expected.shape, actual.shape) + self.assertTrue((actual == expected).all()) + + def test_strided_windows1(self): + out = utils.strided_windows(range(5), 2) + expected = np.array([ + [0, 1], + [1, 2], + [2, 3], + [3, 4] + ]) + self._assert_arrays_equal(expected, out) + + def test_strided_windows2(self): + input_arr = np.arange(10) + out = utils.strided_windows(input_arr, 5) + expected = self.arr10_5.copy() + self._assert_arrays_equal(expected, out) + out[0, 0] = 10 + self.assertEqual(10, input_arr[0], "should make view rather than copy") + + def test_iter_windows_list_texts(self): + texts = [['this', 'is', 'a'], ['test', 'document']] + windows = list(utils.iter_windows(texts, 2)) + list_windows = [list(iterable) for iterable in windows] + expected = [['this', 'is'], ['is', 'a'], ['test', 'document']] + self.assertListEqual(list_windows, expected) + + def 
test_iter_windows_uses_views(self): + texts = [np.array(['this', 'is', 'a'], dtype='object'), ['test', 'document']] + windows = list(utils.iter_windows(texts, 2)) + list_windows = [list(iterable) for iterable in windows] + expected = [['this', 'is'], ['is', 'a'], ['test', 'document']] + self.assertListEqual(list_windows, expected) + windows[0][0] = 'modified' + self.assertEqual('modified', texts[0][0]) + + def test_iter_windows_with_copy(self): + texts = [ + np.array(['this', 'is', 'a'], dtype='object'), + np.array(['test', 'document'], dtype='object') + ] + windows = list(utils.iter_windows(texts, 2, copy=True)) + + windows[0][0] = 'modified' + self.assertEqual('this', texts[0][0]) + + windows[2][0] = 'modified' + self.assertEqual('test', texts[1][0]) if __name__ == '__main__': diff --git a/gensim/utils.py b/gensim/utils.py index 8d5fdb7d7f..36d70b1927 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1188,3 +1188,53 @@ def sample_dict(d, n=10, use_random=True): """ selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) return [(key, d[key]) for key in selected_keys] + + +def strided_windows(ndarray, window_size): + """ + Produce a numpy.ndarray of windows, as from a sliding window. + + >>> strided_windows(np.arange(5), 2) + array([[0, 1], + [1, 2], + [2, 3], + [3, 4]]) + >>> strided_windows(np.arange(10), 5) + array([[0, 1, 2, 3, 4], + [1, 2, 3, 4, 5], + [2, 3, 4, 5, 6], + [3, 4, 5, 6, 7], + [4, 5, 6, 7, 8], + [5, 6, 7, 8, 9]]) + + Args: + ---- + ndarray: either a numpy.ndarray or something that can be converted into one. + window_size: sliding window size. + :param window_size: + :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over + the `ndarray`. Since this uses striding, the individual arrays are views rather than + copies of `ndarray`. Changes to one view modifies the others and the original. + """ + ndarray = np.asarray(ndarray) + stride = ndarray.strides[0] + return np.lib.stride_tricks.as_strided( + ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size), + strides=(stride, stride)) + + +def iter_windows(texts, window_size, copy=False): + """Produce a generator over the given texts using a sliding window of `window_size`. + The windows produced are views of some subsequence of a text. To use deep copies + instead, pass `copy=True`. + + Args: + ---- + texts: List of string sentences. + window_size: Size of sliding window. + copy: False to use views of the texts (default) or True to produce deep copies. 
+ + """ + for document in texts: + for doc_window in strided_windows(document, window_size): + yield doc_window.copy() if copy else doc_window From 2e3852ef9974259fd28402591893498af0b8e7c0 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 15:51:58 -0400 Subject: [PATCH 05/33] handle edge cases with window_size equal to or exceeding document size in strided_windows and iter_windows utiltity functions --- gensim/test/test_text_analysis.py | 0 gensim/test/test_utils.py | 22 ++++++++++++++++++++++ gensim/topic_coherence/text_analysis.py | 0 gensim/utils.py | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 gensim/test/test_text_analysis.py create mode 100644 gensim/topic_coherence/text_analysis.py diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index cbdac0170b..612d55dd68 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -128,6 +128,28 @@ def test_strided_windows2(self): out[0, 0] = 10 self.assertEqual(10, input_arr[0], "should make view rather than copy") + def test_strided_windows_window_size_exceeds_size(self): + input_arr = np.array(['this', 'is', 'test'], dtype='object') + out = utils.strided_windows(input_arr, 4) + expected = np.ndarray((0, 0)) + self._assert_arrays_equal(expected, out) + + def test_strided_windows_window_size_equals_size(self): + input_arr = np.array(['this', 'is', 'test'], dtype='object') + out = utils.strided_windows(input_arr, 3) + expected = np.array([input_arr.copy()]) + self._assert_arrays_equal(expected, out) + + def test_iter_windows_include_below_window_size(self): + texts = [['this', 'is', 'a'], ['test', 'document']] + out = utils.iter_windows(texts, 3, ignore_below_size=False) + windows = [list(w) for w in out] + self.assertEqual(texts, windows) + + out = utils.iter_windows(texts, 3) + windows = [list(w) for w in out] + self.assertEqual([texts[0]], windows) + def test_iter_windows_list_texts(self): texts = [['this', 'is', 'a'], ['test', 'document']] windows = list(utils.iter_windows(texts, 2)) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/gensim/utils.py b/gensim/utils.py index 36d70b1927..3a191f1a9a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1217,13 +1217,18 @@ def strided_windows(ndarray, window_size): copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) + if window_size == ndarray.shape[0]: + return np.array([ndarray]) + elif window_size > ndarray.shape[0]: + return np.ndarray((0, 0)) + stride = ndarray.strides[0] return np.lib.stride_tricks.as_strided( ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size), strides=(stride, stride)) -def iter_windows(texts, window_size, copy=False): +def iter_windows(texts, window_size, copy=False, ignore_below_size=True): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. @@ -1233,8 +1238,15 @@ def iter_windows(texts, window_size, copy=False): texts: List of string sentences. window_size: Size of sliding window. copy: False to use views of the texts (default) or True to produce deep copies. 
+ ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). + If False, the documents below `window_size` will be yielded as the full document. """ for document in texts: - for doc_window in strided_windows(document, window_size): - yield doc_window.copy() if copy else doc_window + doc_windows = strided_windows(document, window_size) + if doc_windows.shape[0] == 0: + if not ignore_below_size: + yield document.copy() if copy else document + else: + for doc_window in doc_windows: + yield doc_window.copy() if copy else doc_window From ec7af1bd89d99210599b183d0a3b018dd7cde7c9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 16:00:07 -0400 Subject: [PATCH 06/33] move code for building inverted index into a new text_analysis module and add initial impl of accumulator that directly tracks term occurrence and co-occurrence counts --- gensim/test/test_text_analysis.py | 84 +++++++++++ .../topic_coherence/probability_estimation.py | 74 +--------- gensim/topic_coherence/text_analysis.py | 132 ++++++++++++++++++ 3 files changed, 222 insertions(+), 68 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index e69de29bb2..d7b4695ac6 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -0,0 +1,84 @@ +import logging +import unittest + +from gensim.topic_coherence.text_analysis import \ + InvertedIndexAccumulator, WordOccurrenceAccumulator + + +class BaseTestCases(object): + + class TextAnalyzerTestBase(unittest.TestCase): + texts = [ + ['this', 'is', 'a'], + ['test', 'document'], + ['this', 'test', 'document'] + ] + token2id = { + 'this': 10, + 'is': 15, + 'a': 20, + 'test': 21, + 'document': 17 + } + top_words = token2id.keys() + + accumulator_cls = None + + def test_occurrence_counting(self): + accumulator = self.accumulator_cls(self.top_words, self.token2id) \ + .accumulate(self.texts, 3) + self.assertEqual(2, accumulator.get_occurrences("this")) + self.assertEqual(1, accumulator.get_occurrences("is")) + self.assertEqual(1, accumulator.get_occurrences("a")) + + self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) + self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) + + def test_occurences_for_irrelevant_words(self): + accumulator = WordOccurrenceAccumulator(self.top_words, self.token2id) \ + .accumulate(self.texts, 2) + with self.assertRaises(KeyError): + accumulator.get_occurrences("irrelevant") + with self.assertRaises(KeyError): + accumulator.get_co_occurrences("test", "irrelevant") + + +class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = InvertedIndexAccumulator + + def test_accumulate1(self): + accumulator = InvertedIndexAccumulator(self.top_words, self.token2id)\ + .accumulate(self.texts, 2) + # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 3}, + 15: {0, 1}, + 20: {1}, + 21: {2, 3, 4}, + 17: {2, 4} + } + self.assertDictEqual(expected, inverted_index) + + def test_accumulate2(self): + accumulator = InvertedIndexAccumulator(self.top_words, self.token2id) \ + .accumulate(self.texts, 3) + # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 2}, + 15: {0}, + 20: {0}, + 21: {1, 2}, + 17: {1, 2} + } + self.assertDictEqual(expected, inverted_index) + + +class 
TestWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = WordOccurrenceAccumulator + + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index ff9c0708bc..c7f5ca4dca 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -9,11 +9,12 @@ """ import logging -from itertools import chain, islice -from collections import defaultdict +import itertools import numpy as np +from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator + logger = logging.getLogger(__name__) @@ -23,7 +24,7 @@ def _ret_top_ids(segmented_topics): """ top_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: - for word_id in chain.from_iterable(s_i): + for word_id in itertools.chain.from_iterable(s_i): if isinstance(word_id, np.ndarray): for i in word_id: top_ids.add(i) @@ -78,60 +79,6 @@ def p_boolean_document(corpus, segmented_topics): return per_topic_postings, len(corpus) -def _iter_windows(texts, window_size): - """Produce a generator over the given texts using a sliding window of `window_size`. - - Args: - ---- - texts: List of string sentences. - window_size: Size of sliding window. - - """ - for document in texts: - it = iter(document) - window = tuple(islice(it, window_size)) - yield window - - for elem in it: - window = window[1:] + (elem,) - yield window - - -class WordOccurrenceAccumulator(object): - """Accumulate word occurrences from a sequence of documents.""" - - def __init__(self, relevant_words): - """ - Args: - ---- - relevant_words: the set of words that occurrences should be accumulated for. - """ - self.relevant_words = set(relevant_words) - self.window_id = 0 # id of next document to be observed - self.word_occurrences = defaultdict(set) # map from words to ids of docs they occur in - - def filter_to_relevant_words(self, doc): - return (word for word in doc if word in self.relevant_words) - - def add_occurrences_from_doc(self, window): - for word in self.filter_to_relevant_words(window): - self.word_occurrences[word].add(self.window_id) - - self.window_id += 1 - - def text_is_relevant(self, text): - for word in text: - if word in self.relevant_words: - return True - return False - - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in _iter_windows(relevant_texts, window_size): - self.add_occurrences_from_doc(virtual_document) - return self - - def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ This function performs the boolean sliding window probability estimation. Boolean sliding window @@ -153,17 +100,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ top_ids = _ret_top_ids(segmented_topics) top_words = _ids_to_words(top_ids, dictionary) - occurrence_accumulator = WordOccurrenceAccumulator(top_words)\ + occurrence_accumulator = InvertedIndexAccumulator(top_words, dictionary.token2id)\ .accumulate(texts, window_size) - # Replace words with their ids. - occurrences = occurrence_accumulator.word_occurrences - per_topic_postings = {dictionary.token2id[word]: id_set - for word, id_set in occurrences.iteritems()} - - # Ensure all top ids have a corresponding set, even if it's an empty one. 
- for word_id in top_ids: - if word_id not in per_topic_postings: - per_topic_postings[word_id] = set() - + per_topic_postings = occurrence_accumulator.index_to_dict() return per_topic_postings, occurrence_accumulator.window_id diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index e69de29bb2..03baec13d3 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains classes for analyzing the texts of a corpus to accumulate +statistical information about word occurrences. +""" + +import itertools + +import numpy as np +import scipy.sparse as sps + +from gensim import utils + + +class TextsAnalyzer(object): + """Gather some statistics about relevant terms a corpus by iterating over texts.""" + + def __init__(self, relevant_words, token2id): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. + """ + self.relevant_words = set(relevant_words) + self.relevant_ids = set(token2id[word] for word in self.relevant_words) + self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self.token2id = token2id + + def filter_to_relevant_words(self, text): + """Lazily filter the text to only those words which are relevant.""" + relevant_words = (word for word in text if word in self.relevant_words) + relevant_ids = (self.token2id[word] for word in relevant_words) + return (self.id2contiguous[word_id] for word_id in relevant_ids) + + def text_is_relevant(self, text): + """Return True if the text has any relevant words, else False.""" + for word in text: + if word in self.relevant_words: + return True + return False + + def analyze_text(self, text): + raise NotImplementedError("Base classes should implement analyze_text.") + + def accumulate(self, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): + self.analyze_text(virtual_document) + return self + + def get_occurrences(self, word): + """Return number of docs the word occurs in, once `accumulate` has been called.""" + word_id = self.token2id[word] + return self._get_occurrences(self.id2contiguous[word_id]) + + def _get_occurrences(self, word_id): + raise NotImplementedError("Base classes should implement occurrences") + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + word_id1 = self.token2id[word1] + word_id2 = self.token2id[word2] + return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) + + def _get_co_occurrences(self, word_id1, word_id2): + raise NotImplementedError("Base classes should implement co_occurrences") + + +class InvertedIndexAccumulator(TextsAnalyzer): + """Build an inverted index from a sequence of corpus texts.""" + + def __init__(self, *args): + super(InvertedIndexAccumulator, self).__init__(*args) + self.window_id = 0 # id of next document to be observed + vocab_size = len(self.relevant_words) + self._inverted_index = np.array([set() for _ in range(vocab_size)]) + + def analyze_text(self, window): + for word_id in self.filter_to_relevant_words(window): + self._inverted_index[word_id].add(self.window_id) + + 
self.window_id += 1 + + def index_to_dict(self): + contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + + def _get_occurrences(self, word_id): + return len(self._inverted_index[word_id]) + + def _get_co_occurrences(self, word_id1, word_id2): + s1 = self._inverted_index[word_id1] + s2 = self._inverted_index[word_id2] + return len(s1.intersection(s2)) + + +class WordOccurrenceAccumulator(TextsAnalyzer): + """Accumulate word occurrences and co-occurrences from a corpus of texts.""" + + def __init__(self, *args): + super(WordOccurrenceAccumulator, self).__init__(*args) + vocab_size = len(self.relevant_words) + self._occurrences = np.zeros(vocab_size, dtype='uint32') + self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + + def analyze_text(self, window): + relevant_words = list(self.filter_to_relevant_words(window)) + uniq_words = np.array(relevant_words) + self._occurrences[uniq_words] += 1 + + for combo in itertools.combinations(relevant_words, 2): + self._co_occurrences[combo] += 1 + + def _symmetrize(self): + co_occ = self._co_occurrences + return co_occ + co_occ.T - np.diag(co_occ.diagonal()) + + def accumulate(self, texts, window_size): + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + self._symmetrize() + return self + + def _get_occurrences(self, word_id): + return self._occurrences[word_id] + + def _get_co_occurrences(self, word_id1, word_id2): + return self._co_occurrences[word_id1, word_id2] From 3f8fb7f52c788d135fbd4da809c97677c85bceb9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 24 May 2017 18:40:04 -0400 Subject: [PATCH 07/33] complete migration to using the accumulators in the new text_analysis package for all confirmation measures in the CoherenceModel pipeline --- gensim/models/coherencemodel.py | 19 +- gensim/test/test_coherencemodel.py | 7 +- gensim/test/test_direct_confirmation.py | 19 +- gensim/test/test_text_analysis.py | 13 +- .../direct_confirmation_measure.py | 41 ++-- .../indirect_confirmation_measure.py | 17 +- .../topic_coherence/probability_estimation.py | 37 +--- gensim/topic_coherence/text_analysis.py | 176 +++++++++++++----- 8 files changed, 202 insertions(+), 127 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 4e110681e2..9888dcb3d6 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -63,6 +63,7 @@ 'c_npmi': 10 } + class CoherenceModel(interfaces.TransformationABC): """ Objects of this class allow for building and maintaining a model for topic @@ -143,6 +144,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.dictionary = dictionary # Check for correct inputs for u_mass coherence measure. + self.coherence = coherence if coherence in boolean_document_based: if is_corpus(corpus)[0]: self.corpus = corpus @@ -155,6 +157,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: self.window_size = window_size + if self.window_size is None: + self.window_size = sliding_windows_dict[self.coherence] if texts is None: raise ValueError("'texts' should be provided for %s coherence." 
% coherence) else: @@ -173,7 +177,6 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= for n, _ in enumerate(topic): t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) - self.coherence = coherence def __str__(self): return coherence_dict[self.coherence].__str__() @@ -203,18 +206,16 @@ def get_coherence_per_topic(self): segmented_topics = measure.seg(self.topics) if self.coherence in boolean_document_based: - per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, per_topic_postings, num_docs) + accumulator = measure.prob(self.corpus, segmented_topics) + return measure.conf(segmented_topics, accumulator) - if self.window_size is not None: - self.window_size = sliding_windows_dict[self.coherence] - per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + return measure.conf(self.topics, segmented_topics, accumulator, 'nlr', 1) else: normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + return measure.conf(segmented_topics, accumulator, normalize=normalize) def aggregate_measures(self, confirmed_measures): measure = coherence_dict[self.coherence] diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 3961f67180..d69aaf0dad 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -14,7 +14,7 @@ import os.path import tempfile -from gensim.models.coherencemodel import CoherenceModel +from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit @@ -35,14 +35,13 @@ ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] -boolean_document_based = ['u_mass'] -sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') + def checkCoherenceMeasure(topics1, topics2, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: @@ -53,6 +52,7 @@ def checkCoherenceMeasure(topics1, topics2, coherence): cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence) return cm1.get_coherence() > cm2.get_coherence() + class TestCoherenceModel(unittest.TestCase): def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. 
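
A rough usage sketch of the refactored CoherenceModel interface exercised by these tests (it reuses the toy corpus and topics defined here; the topic word lists are illustrative):

    from gensim.corpora.dictionary import Dictionary
    from gensim.models.coherencemodel import CoherenceModel

    texts = [['human', 'interface', 'computer'],
             ['eps', 'user', 'interface', 'system'],
             ['system', 'human', 'system', 'eps'],
             ['user', 'response', 'time'],
             ['trees'],
             ['graph', 'trees']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    topics = [['human', 'computer', 'system', 'interface'],
              ['graph', 'trees', 'eps', 'user']]

    cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    per_topic = cm.get_coherence_per_topic()      # confirmation values for the segmented topic pairs
    coherence = cm.aggregate_measures(per_topic)  # same value that cm.get_coherence() returns
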
@@ -219,6 +219,7 @@ def testPersistenceCompressed(self): model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index cb35f0acc4..ad39b99b62 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -10,38 +10,49 @@ import logging import unittest +from collections import namedtuple from gensim.topic_coherence import direct_confirmation_measure +from gensim.topic_coherence import text_analysis + class TestDirectConfirmationMeasure(unittest.TestCase): def setUp(self): # Set up toy example for better understanding and testing # of this module. See the modules for the mathematical formulas self.segmentation = [[(1, 2)]] - self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.posting_list = {1: {2, 3, 4}, 2: {3, 5}} self.num_docs = 5 + id2token = {1: 'test', 2: 'doc'} + token2id = {v: k for k, v in id2token.items()} + dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + self.accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + self.accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + self.accumulator._num_docs = self.num_docs + def testLogConditionalProbability(self): """Test log_conditional_probability()""" - obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.posting_list, self.num_docs)[0] + obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.accumulator)[0] # Answer should be ~ ln(1 / 2) = -0.693147181 expected = -0.693147181 self.assertAlmostEqual(obtained, expected) def testLogRatioMeasure(self): """Test log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0] + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator)[0] # Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 expected = -0.182321557 self.assertAlmostEqual(obtained, expected) def testNormalizedLogRatioMeasure(self): """Test normalized_log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0] + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator, normalize=True)[0] # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753 expected = -0.113282753 self.assertAlmostEqual(obtained, expected) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index d7b4695ac6..27eecbc645 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,5 +1,6 @@ import logging import unittest +from collections import namedtuple from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator @@ -20,12 +21,14 @@ class TextAnalyzerTestBase(unittest.TestCase): 'test': 21, 'document': 17 } - top_words = token2id.keys() + id2token = {v: k for k, v in token2id.items()} + dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + top_ids = set(token2id.values()) accumulator_cls = None def test_occurrence_counting(self): - 
accumulator = self.accumulator_cls(self.top_words, self.token2id) \ + accumulator = self.accumulator_cls(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) self.assertEqual(2, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) @@ -35,7 +38,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurences_for_irrelevant_words(self): - accumulator = WordOccurrenceAccumulator(self.top_words, self.token2id) \ + accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") @@ -47,7 +50,7 @@ class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): accumulator_cls = InvertedIndexAccumulator def test_accumulate1(self): - accumulator = InvertedIndexAccumulator(self.top_words, self.token2id)\ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 2) # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] inverted_index = accumulator.index_to_dict() @@ -61,7 +64,7 @@ def test_accumulate1(self): self.assertDictEqual(expected, inverted_index) def test_accumulate2(self): - accumulator = InvertedIndexAccumulator(self.top_words, self.token2id) \ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] inverted_index = accumulator.index_to_dict() diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index f50fb612e2..60631375ef 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -15,7 +15,8 @@ EPSILON = 1e-12 # Should be small. Value as suggested in paper. -def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): + +def log_conditional_probability(segmented_topics, accumulator): """ This function calculates the log-conditional-probability measure which is used by coherence measures such as U_mass. @@ -24,28 +25,29 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): Args: ---- segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. - num_docs : Total number of documents in corresponding corpus. + accumulator: word occurrence accumulator from probability_estimation. Returns: ------- m_lc : List of log conditional probability measure on each set in segmented topics. 
""" m_lc = [] + num_docs = float(accumulator.num_docs) for s_i in segmented_topics: for w_prime, w_star in s_i: - w_prime_docs = per_topic_postings[w_prime] - w_star_docs = per_topic_postings[w_star] - co_docs = w_prime_docs.intersection(w_star_docs) - if w_star_docs: - m_lc_i = np.log(((len(co_docs) / float(num_docs)) + EPSILON) / (len(w_star_docs) / float(num_docs))) - else: + try: + w_star_count = accumulator[w_star] + co_occur_count = accumulator[w_prime, w_star] + m_lc_i = np.log(((co_occur_count / num_docs) + EPSILON) / (w_star_count / num_docs)) + except KeyError: m_lc_i = 0.0 + m_lc.append(m_lc_i) return m_lc -def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False): + +def log_ratio_measure(segmented_topics, accumulator, normalize=False): """ If normalize=False: Popularly known as PMI. @@ -61,28 +63,29 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize= Args: ---- segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics - num_docs : Total number of documents in corpus. Used for calculating probability. + accumulator: word occurrence accumulator from probability_estimation. Returns: ------- m_lr : List of log ratio measures on each set in segmented topics. """ m_lr = [] + num_docs = float(accumulator.num_docs) for s_i in segmented_topics: for w_prime, w_star in s_i: - w_prime_docs = per_topic_postings[w_prime] - w_star_docs = per_topic_postings[w_star] - co_docs = w_prime_docs.intersection(w_star_docs) + w_prime_count = accumulator[w_prime] + w_star_count = accumulator[w_star] + co_occur_count = accumulator[w_prime, w_star] + if normalize: # For normalized log ratio measure - numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] - co_doc_prob = len(co_docs) / float(num_docs) + numerator = log_ratio_measure([[(w_prime, w_star)]], accumulator)[0] + co_doc_prob = co_occur_count / num_docs m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON)) else: # For log ratio measure without normalization - numerator = (len(co_docs) / float(num_docs)) + EPSILON - denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) + numerator = (co_occur_count / num_docs) + EPSILON + denominator = (w_prime_count / num_docs) * (w_star_count / num_docs) m_lr_i = np.log(numerator / denominator) m_lr.append(m_lr_i) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index c68206a372..c4585ad677 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -48,7 +48,8 @@ def _present(w_prime_star, w, w_backtrack): return -1 return index -def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): + +def _make_seg(w_prime, w, accumulator, measure, gamma, backtrack): """ Internal helper function to return context vectors for segmentations. 
""" @@ -57,7 +58,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc for w_j in w: for w_i in w_prime: if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], accumulator, measure[1])[0] if w_j not in context_vectors: context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma else: @@ -65,11 +66,13 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc else: for w_j in w: if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], accumulator, measure[1])[0] context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma - return (context_vectors, backtrack) -def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs): + return context_vectors, backtrack + + +def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): """ This function calculates the indirect cosine measure. Given context vectors _ _ _ _ @@ -116,7 +119,7 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam if w_backtrack and w_prime_index != -1: w_prime_context_vectors = context_vector_backtrack[w_prime_index] else: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, accumulator, measure, gamma, backtrack) backtrack.update(backtrack_i) # Update backtracking lists w_backtrack.append((w_prime, top_words)) @@ -128,7 +131,7 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam if w_backtrack and w_star_index != -1: w_star_context_vectors = context_vector_backtrack[w_star_index] else: - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, accumulator, measure, gamma, backtrack) backtrack.update(backtrack_i) # Update all backtracking lists w_backtrack.append((w_star, top_words)) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index c7f5ca4dca..d9982ca409 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,7 @@ import numpy as np -from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator +from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator, CorpusAnalyzer logger = logging.getLogger(__name__) @@ -34,21 +34,6 @@ def _ret_top_ids(segmented_topics): return top_ids -def _ids_to_words(ids, dictionary): - """Convert an iterable of ids to their corresponding words using a dictionary. - This function abstracts away the differences between the HashDictionary and the standard one. - """ - top_words = set() - for word_id in ids: - word = dictionary[word_id] - if isinstance(word, set): - top_words = top_words.union(word) - else: - top_words.add(word) - - return top_words - - def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -65,18 +50,8 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. 
""" top_ids = _ret_top_ids(segmented_topics) - # Instantiate the dictionary with empty sets for each top_id - per_topic_postings = {word_id: set() for word_id in top_ids} - - # Iterate through the documents, appending the document number to the set for each top_id it contains - for n, document in enumerate(corpus): - doc_words = frozenset(x[0] for x in document) - top_ids_in_doc = top_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for word_id in top_ids_in_doc: - per_topic_postings[word_id].add(n) - - return per_topic_postings, len(corpus) + accumulator = CorpusAnalyzer(top_ids).accumulate(corpus) + return accumulator def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): @@ -99,9 +74,5 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - top_words = _ids_to_words(top_ids, dictionary) - occurrence_accumulator = InvertedIndexAccumulator(top_words, dictionary.token2id)\ + return InvertedIndexAccumulator(top_ids, dictionary)\ .accumulate(texts, window_size) - - per_topic_postings = occurrence_accumulator.index_to_dict() - return per_topic_postings, occurrence_accumulator.window_id diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 03baec13d3..f175cbe21a 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,78 +17,99 @@ from gensim import utils -class TextsAnalyzer(object): - """Gather some statistics about relevant terms a corpus by iterating over texts.""" +def _ids_to_words(ids, dictionary): + """Convert an iterable of ids to their corresponding words using a dictionary. + This function abstracts away the differences between the HashDictionary and the standard one. + """ + if not dictionary.id2token: + setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) - def __init__(self, relevant_words, token2id): - """ - Args: - ---- - relevant_words: the set of words that occurrences should be accumulated for. 
- """ - self.relevant_words = set(relevant_words) - self.relevant_ids = set(token2id[word] for word in self.relevant_words) - self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} - self.token2id = token2id + top_words = set() + for word_id in ids: + word = dictionary.id2token[word_id] + if isinstance(word, set): + top_words = top_words.union(word) + else: + top_words.add(word) - def filter_to_relevant_words(self, text): - """Lazily filter the text to only those words which are relevant.""" - relevant_words = (word for word in text if word in self.relevant_words) - relevant_ids = (self.token2id[word] for word in relevant_words) - return (self.id2contiguous[word_id] for word_id in relevant_ids) + return top_words - def text_is_relevant(self, text): - """Return True if the text has any relevant words, else False.""" - for word in text: - if word in self.relevant_words: - return True - return False + +class BaseAnalyzer(object): + """Base class for corpus and text analyzers.""" + + def __init__(self, relevant_ids): + self.relevant_ids = relevant_ids + self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self._num_docs = 0 + + @property + def num_docs(self): + return self._num_docs def analyze_text(self, text): raise NotImplementedError("Base classes should implement analyze_text.") - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): - self.analyze_text(virtual_document) - return self + def __getitem__(self, word_or_words): + if hasattr(word_or_words, '__iter__'): + return self.get_co_occurrences(*word_or_words) + else: + return self.get_occurrences(word_or_words) - def get_occurrences(self, word): + def get_occurrences(self, word_id): """Return number of docs the word occurs in, once `accumulate` has been called.""" - word_id = self.token2id[word] return self._get_occurrences(self.id2contiguous[word_id]) def _get_occurrences(self, word_id): raise NotImplementedError("Base classes should implement occurrences") - def get_co_occurrences(self, word1, word2): + def get_co_occurrences(self, word_id1, word_id2): """Return number of docs the words co-occur in, once `accumulate` has been called.""" - word_id1 = self.token2id[word1] - word_id2 = self.token2id[word2] return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) def _get_co_occurrences(self, word_id1, word_id2): raise NotImplementedError("Base classes should implement co_occurrences") -class InvertedIndexAccumulator(TextsAnalyzer): - """Build an inverted index from a sequence of corpus texts.""" +class UsesDictionary(BaseAnalyzer): + """Base class for corpus and text analyzers.""" - def __init__(self, *args): - super(InvertedIndexAccumulator, self).__init__(*args) - self.window_id = 0 # id of next document to be observed - vocab_size = len(self.relevant_words) - self._inverted_index = np.array([set() for _ in range(vocab_size)]) + def __init__(self, relevant_ids, dictionary): + super(UsesDictionary, self).__init__(relevant_ids) + self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) + self.token2id = dictionary.token2id - def analyze_text(self, window): - for word_id in self.filter_to_relevant_words(window): - self._inverted_index[word_id].add(self.window_id) + def analyze_text(self, text): + raise NotImplementedError("Base classes should implement analyze_text.") - 
self.window_id += 1 + def get_occurrences(self, word): + """Return number of docs the word occurs in, once `accumulate` has been called.""" + try: + word_id = self.token2id[word] + except KeyError: + word_id = word + return self._get_occurrences(self.id2contiguous[word_id]) + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + try: + word_id1 = self.token2id[word1] + except KeyError: + word_id1 = word1 + try: + word_id2 = self.token2id[word2] + except KeyError: + word_id2 = word2 + return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) - def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} - return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + +class InvertedIndexBased(BaseAnalyzer): + """Analyzer that builds up an inverted index to accumulate stats.""" + + def __init__(self, *args): + super(InvertedIndexBased, self).__init__(*args) + vocab_size = len(self.relevant_ids) + self._inverted_index = np.array([set() for _ in range(vocab_size)]) def _get_occurrences(self, word_id): return len(self._inverted_index[word_id]) @@ -98,6 +119,67 @@ def _get_co_occurrences(self, word_id1, word_id2): s2 = self._inverted_index[word_id2] return len(s1.intersection(s2)) + def index_to_dict(self): + contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + + +class CorpusAnalyzer(InvertedIndexBased): + """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" + + def analyze_text(self, text): + doc_words = frozenset(x[0] for x in text) + top_ids_in_doc = self.relevant_ids.intersection(doc_words) + if len(top_ids_in_doc) > 0: + for word_id in top_ids_in_doc: + self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) + + def accumulate(self, corpus): + for document in corpus: + self.analyze_text(document) + self._num_docs += 1 + return self + + +class TextsAnalyzer(UsesDictionary): + """Gather some statistics about relevant terms a corpus by iterating over texts.""" + + def __init__(self, relevant_ids, dictionary): + """ + Args: + ---- + relevant_words: the set of words that occurrences should be accumulated for. 
+ """ + super(TextsAnalyzer, self).__init__(relevant_ids, dictionary) + + def filter_to_relevant_words(self, text): + """Lazily filter the text to only those words which are relevant.""" + relevant_words = (word for word in text if word in self.relevant_words) + relevant_ids = (self.token2id[word] for word in relevant_words) + return (self.id2contiguous[word_id] for word_id in relevant_ids) + + def text_is_relevant(self, text): + """Return True if the text has any relevant words, else False.""" + for word in text: + if word in self.relevant_words: + return True + return False + + def accumulate(self, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): + self.analyze_text(virtual_document) + self._num_docs += 1 + return self + + +class InvertedIndexAccumulator(TextsAnalyzer, InvertedIndexBased): + """Build an inverted index from a sequence of corpus texts.""" + + def analyze_text(self, window): + for word_id in self.filter_to_relevant_words(window): + self._inverted_index[word_id].add(self._num_docs) + class WordOccurrenceAccumulator(TextsAnalyzer): """Accumulate word occurrences and co-occurrences from a corpus of texts.""" From b12edefb26fd02cefdfd0863ea53c7104cd510f8 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 25 May 2017 11:02:13 -0400 Subject: [PATCH 08/33] fix bug in WordOccurrenceAccumulator so that co-occurrences of same word are interpreted as the occurrence; update tests to cover this case; change the p_boolean_sliding_window to use the WordOccurrenceAccumulator; minor cleanup in test_coherencemodel --- gensim/models/coherencemodel.py | 14 +++--- gensim/test/test_coherencemodel.py | 31 ++++++------ gensim/test/test_text_analysis.py | 43 +++++++++++++++++ .../topic_coherence/probability_estimation.py | 6 +-- gensim/topic_coherence/text_analysis.py | 48 ++++++++++++------- 5 files changed, 101 insertions(+), 41 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 9888dcb3d6..80e3b380d9 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -178,6 +178,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) + self._accumulator = None + def __str__(self): return coherence_dict[self.coherence].__str__() @@ -206,16 +208,16 @@ def get_coherence_per_topic(self): segmented_topics = measure.seg(self.topics) if self.coherence in boolean_document_based: - accumulator = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, accumulator) + self._accumulator = measure.prob(self.corpus, segmented_topics) + return measure.conf(segmented_topics, self._accumulator) - accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, accumulator, 'nlr', 1) + return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) else: normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, accumulator, normalize=normalize) + return measure.conf(segmented_topics, self._accumulator, normalize=normalize) def 
aggregate_measures(self, confirmed_measures): measure = coherence_dict[self.coherence] diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index d69aaf0dad..679f115f5b 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -42,17 +42,6 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -def checkCoherenceMeasure(topics1, topics2, coherence): - """Check provided topic coherence algorithm on given topics""" - if coherence in boolean_document_based: - cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence) - cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence) - else: - cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence) - cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence) - return cm1.get_coherence() > cm2.get_coherence() - - class TestCoherenceModel(unittest.TestCase): def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. @@ -77,21 +66,33 @@ def setUp(self): self.vw_path = vw_path self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) + def check_coherence_measure(self, coherence): + """Check provided topic coherence algorithm on given topics""" + if coherence in boolean_document_based: + kwargs = dict(corpus=corpus, dictionary=dictionary, coherence=coherence) + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) + else: + kwargs = dict(texts=texts, dictionary=dictionary, coherence=coherence) + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) + self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) + def testUMass(self): """Test U_Mass topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass')) + self.check_coherence_measure('u_mass') def testCv(self): """Test C_v topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v')) + self.check_coherence_measure('c_v') def testCuci(self): """Test C_uci topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci')) + self.check_coherence_measure('c_uci') def testCnpmi(self): """Test C_npmi topic coherence algorithm on given topics""" - self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi')) + self.check_coherence_measure('c_npmi') def testUMassLdaModel(self): """Perform sanity check to see if u_mass coherence works with LDA Model""" diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 27eecbc645..33a269f9d2 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -4,6 +4,7 @@ from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator +from gensim.corpora.dictionary import Dictionary class BaseTestCases(object): @@ -25,6 +26,20 @@ class TextAnalyzerTestBase(unittest.TestCase): dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) top_ids = set(token2id.values()) + texts2 = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 
'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['user', 'user']] + dictionary2 = Dictionary(texts2) + dictionary2.id2token = {v: k for k, v in dictionary2.token2id.items()} + top_ids2 = set(dictionary2.token2id.values()) + accumulator_cls = None def test_occurrence_counting(self): @@ -37,6 +52,34 @@ def test_occurrence_counting(self): self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) + def test_occurrence_counting2(self): + accumulator = self.accumulator_cls(self.top_ids2, self.dictionary2) \ + .accumulate(self.texts2, 110) + self.assertEqual(2, accumulator.get_occurrences("human")) + self.assertEqual(4, accumulator.get_occurrences("user")) + self.assertEqual(3, accumulator.get_occurrences("graph")) + self.assertEqual(3, accumulator.get_occurrences("trees")) + + cases = [ + (1, ("human", "interface")), + (2, ("system", "user")), + (2, ("graph", "minors")), + (2, ("graph", "trees")), + (4, ("user", "user")), + (3, ("graph", "graph")), + (0, ("time", "eps")) + ] + for expected_count, (word1, word2) in cases: + # Verify co-occurrence counts are correct, regardless of word order. + self.assertEqual(expected_count, accumulator.get_co_occurrences(word1, word2)) + self.assertEqual(expected_count, accumulator.get_co_occurrences(word2, word1)) + + # Also verify that using token ids instead of tokens works the same. + word_id1 = self.dictionary2.token2id[word1] + word_id2 = self.dictionary2.token2id[word2] + self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id1, word_id2)) + self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) + def test_occurences_for_irrelevant_words(self): accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 2) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index d9982ca409..f406e5a3e7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,7 @@ import numpy as np -from gensim.topic_coherence.text_analysis import InvertedIndexAccumulator, CorpusAnalyzer +from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. """ top_ids = _ret_top_ids(segmented_topics) - accumulator = CorpusAnalyzer(top_ids).accumulate(corpus) + accumulator = CorpusAccumulator(top_ids).accumulate(corpus) return accumulator @@ -74,5 +74,5 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - return InvertedIndexAccumulator(top_ids, dictionary)\ + return WordOccurrenceAccumulator(top_ids, dictionary)\ .accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index f175cbe21a..a7ab9b815b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -20,8 +20,13 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. This function abstracts away the differences between the HashDictionary and the standard one. 
+ + Args: + ---- + ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). + This is the format returned by the topic_coherence.segmentation functions. """ - if not dictionary.id2token: + if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) top_words = set() @@ -72,7 +77,9 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): - """Base class for corpus and text analyzers.""" + """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. + The standard BaseAnalyzer can only deal with token ids since it does not have access to the token2id mapping. + """ def __init__(self, relevant_ids, dictionary): super(UsesDictionary, self).__init__(relevant_ids) @@ -90,17 +97,18 @@ def get_occurrences(self, word): word_id = word return self._get_occurrences(self.id2contiguous[word_id]) - def get_co_occurrences(self, word1, word2): - """Return number of docs the words co-occur in, once `accumulate` has been called.""" + def _word2_contiguous_id(self, word): try: - word_id1 = self.token2id[word1] - except KeyError: - word_id1 = word1 - try: - word_id2 = self.token2id[word2] + word_id = self.token2id[word] except KeyError: - word_id2 = word2 - return self._get_co_occurrences(self.id2contiguous[word_id1], self.id2contiguous[word_id2]) + word_id = word + return self.id2contiguous[word_id] + + def get_co_occurrences(self, word1, word2): + """Return number of docs the words co-occur in, once `accumulate` has been called.""" + word_id1 = self._word2_contiguous_id(word1) + word_id2 = self._word2_contiguous_id(word2) + return self._get_co_occurrences(word_id1, word_id2) class InvertedIndexBased(BaseAnalyzer): @@ -124,7 +132,7 @@ def index_to_dict(self): return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} -class CorpusAnalyzer(InvertedIndexBased): +class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text): @@ -192,15 +200,21 @@ def __init__(self, *args): def analyze_text(self, window): relevant_words = list(self.filter_to_relevant_words(window)) - uniq_words = np.array(relevant_words) - self._occurrences[uniq_words] += 1 + if relevant_words: + uniq_words = np.array(relevant_words) + self._occurrences[uniq_words] += 1 - for combo in itertools.combinations(relevant_words, 2): - self._co_occurrences[combo] += 1 + for combo in itertools.combinations(relevant_words, 2): + self._co_occurrences[combo] += 1 def _symmetrize(self): + """Word pairs may have been encountered in (i, j) and (j, i) order. + Rather than enforcing a particular ordering during the update process, + we choose to symmetrize the co-occurrence matrix after accumulation has completed. 
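As a minimal, self-contained illustration of the same identity (the counts are made up; the sketch only needs scipy.sparse):

    import scipy.sparse as sps

    counts = sps.lil_matrix((3, 3), dtype='uint32')
    counts[0, 1] = 2           # pair observed twice in (0, 1) order
    counts[1, 0] = 1           # ... and once in (1, 0) order
    counts.setdiag([4, 3, 1])  # diagonal holds the plain occurrence counts
    sym = counts + counts.T - sps.diags(counts.diagonal(), dtype='uint32')
    # sym[0, 1] == sym[1, 0] == 3, while the diagonal stays (4, 3, 1)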
+ """ co_occ = self._co_occurrences - return co_occ + co_occ.T - np.diag(co_occ.diagonal()) + co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts + self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') def accumulate(self, texts, window_size): super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) From 91b8a05e10686585e7c29af9b09af3572e00d469 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 25 May 2017 14:39:03 -0400 Subject: [PATCH 09/33] make wikicorpus parsing handle KeyboardInterrupt gracefully --- gensim/corpora/wikicorpus.py | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index fb402da517..2d9b598a71 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -23,6 +23,7 @@ import re from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution import multiprocessing +import signal from gensim import utils @@ -249,6 +250,10 @@ def process_article(args): return result, title, pageid +def init_worker(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + class WikiCorpus(TextCorpus): """ Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. @@ -300,22 +305,26 @@ def get_texts(self): articles, articles_all = 0, 0 positions, positions_all = 0, 0 texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) - pool = multiprocessing.Pool(self.processes) + pool = multiprocessing.Pool(self.processes, init_worker) # process the corpus in smaller chunks of docs, because multiprocessing.Pool # is dumb and would load the entire input into RAM at once... 
- for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): - for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): - articles_all += 1 - positions_all += len(tokens) - # article redirects and short stubs are pruned here - if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): - continue - articles += 1 - positions += len(tokens) - if self.metadata: - yield (tokens, (pageid, title)) - else: - yield tokens + try: + for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): + for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): + articles_all += 1 + positions_all += len(tokens) + # article redirects and short stubs are pruned here + if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + continue + articles += 1 + positions += len(tokens) + if self.metadata: + yield (tokens, (pageid, title)) + else: + yield tokens + except KeyboardInterrupt: + pass + pool.terminate() logger.info( From c6224b7d6d02b069dd9f2731ee2de01e50719257 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 26 May 2017 19:23:04 -0400 Subject: [PATCH 10/33] add ParallelWordOccurrenceAccumulator and make default method for p_boolean_sliding_window; add parameter for CoherenceModel to adjust number of processes used, with default equal to max(1, cpu_count - 1) --- gensim/models/coherencemodel.py | 7 +- gensim/test/test_text_analysis.py | 30 +++- .../topic_coherence/probability_estimation.py | 16 +- gensim/topic_coherence/text_analysis.py | 158 +++++++++++++++++- gensim/utils.py | 10 +- 5 files changed, 200 insertions(+), 21 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 80e3b380d9..d0ff707457 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -20,6 +20,7 @@ import logging from collections import namedtuple +import multiprocessing as mp import numpy as np @@ -90,7 +91,7 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
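A hypothetical usage sketch (variable names are illustrative; my_topics is a list of lists of top tokens present in my_dictionary, and leaving `processes` at its default of -1 falls back to max(1, cpu_count - 1)):

    cm = CoherenceModel(topics=my_topics, texts=my_texts, dictionary=my_dictionary,
                        coherence='c_v', processes=2)
    score = cm.get_coherence()                # aggregated coherence over all topics
    per_topic = cm.get_coherence_per_topic()  # unaggregated confirmation values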
""" def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, - coherence='c_v', topn=10): + coherence='c_v', topn=10, processes=-1): """ Args: ---- @@ -178,6 +179,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) + self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) self._accumulator = None def __str__(self): @@ -212,7 +214,8 @@ def get_coherence_per_topic(self): return measure.conf(segmented_topics, self._accumulator) self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + dictionary=self.dictionary, window_size=self.window_size, + processes=self.processes) if self.coherence == 'c_v': return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) else: diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 33a269f9d2..8ee08a2373 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,9 +1,8 @@ import logging import unittest -from collections import namedtuple from gensim.topic_coherence.text_analysis import \ - InvertedIndexAccumulator, WordOccurrenceAccumulator + InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator from gensim.corpora.dictionary import Dictionary @@ -22,8 +21,9 @@ class TextAnalyzerTestBase(unittest.TestCase): 'test': 21, 'document': 17 } - id2token = {v: k for k, v in token2id.items()} - dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + dictionary = Dictionary(texts) + dictionary.token2id = token2id + dictionary.id2token = {v: k for k, v in token2id.items()} top_ids = set(token2id.values()) texts2 = [['human', 'interface', 'computer'], @@ -42,8 +42,14 @@ class TextAnalyzerTestBase(unittest.TestCase): accumulator_cls = None + def init_accumulator(self): + return self.accumulator_cls(self.top_ids, self.dictionary) + + def init_accumulator2(self): + return self.accumulator_cls(self.top_ids2, self.dictionary2) + def test_occurrence_counting(self): - accumulator = self.accumulator_cls(self.top_ids, self.dictionary) \ + accumulator = self.init_accumulator()\ .accumulate(self.texts, 3) self.assertEqual(2, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) @@ -53,7 +59,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): - accumulator = self.accumulator_cls(self.top_ids2, self.dictionary2) \ + accumulator = self.init_accumulator2()\ .accumulate(self.texts2, 110) self.assertEqual(2, accumulator.get_occurrences("human")) self.assertEqual(4, accumulator.get_occurrences("user")) @@ -81,7 +87,7 @@ def test_occurrence_counting2(self): self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) def test_occurences_for_irrelevant_words(self): - accumulator = WordOccurrenceAccumulator(self.top_ids, self.dictionary) \ + accumulator = self.init_accumulator() \ .accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") @@ -125,6 +131,16 @@ class TestWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): accumulator_cls = WordOccurrenceAccumulator +class TestParallelWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase): + accumulator_cls = 
ParallelWordOccurrenceAccumulator + + def init_accumulator(self): + return self.accumulator_cls(2, self.top_ids, self.dictionary) + + def init_accumulator2(self): + return self.accumulator_cls(2, self.top_ids2, self.dictionary2) + + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index f406e5a3e7..604fa07a24 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -13,7 +13,8 @@ import numpy as np -from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator +from gensim.topic_coherence.text_analysis import \ + CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator logger = logging.getLogger(__name__) @@ -50,11 +51,10 @@ def p_boolean_document(corpus, segmented_topics): num_docs : Total number of documents in corpus. """ top_ids = _ret_top_ids(segmented_topics) - accumulator = CorpusAccumulator(top_ids).accumulate(corpus) - return accumulator + return CorpusAccumulator(top_ids).accumulate(corpus) -def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): +def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): """ This function performs the boolean sliding window probability estimation. Boolean sliding window determines word counts using a sliding window. The window moves over the documents one word token per step. @@ -74,5 +74,9 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - return WordOccurrenceAccumulator(top_ids, dictionary)\ - .accumulate(texts, window_size) + if processes <= 1: + accumulator = WordOccurrenceAccumulator(top_ids, dictionary) + else: + accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) + logger.info("using %s to estimate probabilities from sliding windows" % accumulator) + return accumulator.accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a7ab9b815b..a9265347a3 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -9,13 +9,18 @@ statistical information about word occurrences. """ +import sys import itertools +import logging +import multiprocessing as mp import numpy as np import scipy.sparse as sps from gensim import utils +logger = logging.getLogger(__name__) + def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. 
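For reference, a hedged sketch of the two dictionary flavours this helper has to cope with (the toy texts are illustrative):

    from gensim.corpora.dictionary import Dictionary
    from gensim.corpora.hashdictionary import HashDictionary

    texts = [['human', 'interface'], ['graph', 'trees']]
    d1 = Dictionary(texts)      # id2token maps each id to a single token
    d2 = HashDictionary(texts)  # id2token maps each id to a set of tokens (several tokens can share one hashed id)

    _ids_to_words(set(d1.token2id.values()), d1)  # {'human', 'interface', 'graph', 'trees'}
    _ids_to_words(set(d2.token2id.values()), d2)  # same tokens, via the set-valued branch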
@@ -46,12 +51,20 @@ class BaseAnalyzer(object): def __init__(self, relevant_ids): self.relevant_ids = relevant_ids self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} + self.log_every = 1000 self._num_docs = 0 @property def num_docs(self): return self._num_docs + @num_docs.setter + def num_docs(self, num): + self._num_docs = num + if self._num_docs % self.log_every == 0: + logger.info("%s accumulated stats from %d documents" % ( + self.__class__.__name__, self._num_docs)) + def analyze_text(self, text): raise NotImplementedError("Base classes should implement analyze_text.") @@ -84,6 +97,7 @@ class UsesDictionary(BaseAnalyzer): def __init__(self, relevant_ids, dictionary): super(UsesDictionary, self).__init__(relevant_ids) self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) + self.dictionary = dictionary self.token2id = dictionary.token2id def analyze_text(self, text): @@ -177,7 +191,7 @@ def accumulate(self, texts, window_size): relevant_texts = (text for text in texts if self.text_is_relevant(text)) for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): self.analyze_text(virtual_document) - self._num_docs += 1 + self.num_docs += 1 return self @@ -198,6 +212,9 @@ def __init__(self, *args): self._occurrences = np.zeros(vocab_size, dtype='uint32') self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + def __str__(self): + return self.__class__.__name__ + def analyze_text(self, window): relevant_words = list(self.filter_to_relevant_words(window)) if relevant_words: @@ -217,6 +234,7 @@ def _symmetrize(self): self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') def accumulate(self, texts, window_size): + self._co_occurrences = self._co_occurrences.tolil() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) self._symmetrize() return self @@ -226,3 +244,141 @@ def _get_occurrences(self, word_id): def _get_co_occurrences(self, word_id1, word_id2): return self._co_occurrences[word_id1, word_id2] + + def merge(self, other): + self._occurrences += other._occurrences + self._co_occurrences += other._co_occurrences + self._num_docs += other._num_docs + + +class _WordOccurrenceAccumulator(WordOccurrenceAccumulator): + """Monkey patched to avoid symmetrizing co-occurrence matrix after each batch.""" + def accumulate(self, texts, window_size): + TextsAnalyzer.accumulate(self, texts, window_size) + return self + + +class ParallelWordOccurrenceAccumulator(TextsAnalyzer): + """Accumulate word occurrences in parallel.""" + + def __init__(self, processes, *args, **kwargs): + super(ParallelWordOccurrenceAccumulator, self).__init__(*args) + if processes < 2: + raise ValueError("Must have at least 2 processes to run in parallel; got %d" % processes) + self.processes = processes + self.batch_size = kwargs.get('batch_size', 16) + + def __str__(self): + return "%s(processes=%s, batch_size=%s)" % ( + self.__class__.__name__, self.processes, self.batch_size) + + def accumulate(self, texts, window_size): + workers, input_q, output_q = self.start_workers(window_size) + try: + self.queue_all_texts(input_q, texts, window_size) + interrupted = False + except KeyboardInterrupt: + logger.warn("stats accumulation interrupted; <= %d documents processed" % self._num_docs) + interrupted = True + + accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) + return self.merge_accumulators(accumulators) + + def start_workers(self, window_size): + 
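        # One bounded input queue feeds batches of texts to all workers; a shared output
        # queue carries each worker's partial accumulator back to the master once it is
        # told to stop (see terminate_workers below, which sends one sentinel per worker).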
input_q = mp.Queue(maxsize=self.processes) + output_q = mp.Queue() + workers = [] + for _ in range(self.processes): + accumulator = _WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) + worker.start() + workers.append(worker) + + return workers, input_q, output_q + + def yield_batches(self, texts): + batch = [] + for text in texts: + batch.append(text) + if len(batch) == self.batch_size: + yield batch + batch = [] + + if batch: + yield batch + + def queue_all_texts(self, q, texts, window_size): + relevant_texts = (text for text in texts if self.text_is_relevant(text)) + for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): + q.put(batch, block=True) + before = self._num_docs / self.log_every + self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) + if before < (self._num_docs / self.log_every): + logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( + batch_num, batch_num * self.batch_size, self._num_docs)) + + def terminate_workers(self, input_q, output_q, workers, interrupted=False): + if not interrupted: + for _ in workers: + input_q.put(None, block=True) + + accumulators = [] + while len(accumulators) != len(workers): + accumulators.append(output_q.get()) + logger.info("%d accumulators retrieved from output queue" % len(accumulators)) + + for worker in workers: + if worker.is_alive(): + worker.terminate() + + input_q.close() + output_q.close() + return accumulators + + def merge_accumulators(self, accumulators): + accumulator = accumulators[0] + for other_accumulator in accumulators[1:]: + accumulator.merge(other_accumulator) + accumulator._symmetrize() + return accumulator + + +class AccumulatingWorker(mp.Process): + """Accumulate stats from texts fed in from queue.""" + + def __init__(self, input_q, output_q, accumulator, window_size): + super(AccumulatingWorker, self).__init__() + self.input_q = input_q + self.output_q = output_q + self.accumulator = accumulator + self.accumulator.log_every = sys.maxint # avoid logging in workers + self.window_size = window_size + + def run(self): + try: + self._run() + except KeyboardInterrupt: + logger.info("%s interrupted after processing %d documents" % ( + self.__class__.__name__, self.accumulator.num_docs)) + finally: + self.reply_to_master() + + def _run(self): + batch_num = 0 + n_docs = 0 + while True: + docs = self.input_q.get(block=True) + if docs is None: # sentinel value + break + + self.accumulator.accumulate(docs, self.window_size) + n_docs += len(docs) + logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( + batch_num, n_docs, self.accumulator.num_docs)) + batch_num += 1 + + def reply_to_master(self): + logger.info("serializing accumulator to return to master...") + self.output_q.put(self.accumulator, block=False) + logger.info("accumulator serialized") + diff --git a/gensim/utils.py b/gensim/utils.py index 3a191f1a9a..7300b17abd 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1193,7 +1193,7 @@ def sample_dict(d, n=10, use_random=True): def strided_windows(ndarray, window_size): """ Produce a numpy.ndarray of windows, as from a sliding window. - + >>> strided_windows(np.arange(5), 2) array([[0, 1], [1, 2], @@ -1206,12 +1206,12 @@ def strided_windows(ndarray, window_size): [3, 4, 5, 6, 7], [4, 5, 6, 7, 8], [5, 6, 7, 8, 9]]) - + Args: ---- ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. 
- :param window_size: + :param window_size: :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over the `ndarray`. Since this uses striding, the individual arrays are views rather than copies of `ndarray`. Changes to one view modifies the others and the original. @@ -1232,7 +1232,7 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. - + Args: ---- texts: List of string sentences. @@ -1240,7 +1240,7 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): copy: False to use views of the texts (default) or True to produce deep copies. ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). If False, the documents below `window_size` will be yielded as the full document. - + """ for document in texts: doc_windows = strided_windows(document, window_size) From f00d389a4f24bc1433023663aa55ecdaeb432530 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 26 May 2017 21:57:59 -0400 Subject: [PATCH 11/33] clean up, clarify, and optimize the indirect_confirmation_measure.cosine_similarity function --- gensim/test/test_indirect_confirmation.py | 25 ++- .../indirect_confirmation_measure.py | 175 +++++++++--------- 2 files changed, 107 insertions(+), 93 deletions(-) diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index 8fca92a34a..6bdc8abe32 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -12,9 +12,11 @@ import unittest from gensim.topic_coherence import indirect_confirmation_measure +from gensim.topic_coherence import text_analysis +from gensim.corpora.dictionary import Dictionary import numpy as np -from numpy import array + class TestIndirectConfirmation(unittest.TestCase): def setUp(self): @@ -22,17 +24,21 @@ def setUp(self): # of this module. See the modules for the mathematical formulas self.topics = [np.array([1, 2])] # Result from s_one_set segmentation: - self.segmentation = [[(1, array([1, 2])), (2, array([1, 2]))]] - self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] self.gamma = 1 self.measure = 'nlr' - self.num_docs = 5 + + dictionary = Dictionary() + dictionary.id2token = {1: 'fake', 2: 'tokens'} + self.accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + self.accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + self.accumulator._num_docs = 5 def testCosineSimilarity(self): """Test cosine_similarity()""" - obtained = indirect_confirmation_measure.cosine_similarity(self.topics, self.segmentation, - self.posting_list, self.measure, - self.gamma, self.num_docs) + obtained = indirect_confirmation_measure.cosine_similarity( + self.topics, self.segmentation, self.accumulator, self.measure, self.gamma) + # The steps involved in this calculation are as follows: # 1. Take (1, array([1, 2]). Take w' which is 1. # 2. Calculate nlr(1, 1), nlr(1, 2). This is our first vector. @@ -41,8 +47,9 @@ def testCosineSimilarity(self): # 5. Find out cosine similarity between these two vectors. # 6. Similarly for the second segmentation. 
expected = [0.6230, 0.6230] # To account for EPSILON approximation - self.assertAlmostEqual(obtained[0], expected[0], 4) - self.assertAlmostEqual(obtained[1], expected[1], 4) + for i in range(len(expected)): + self.assertAlmostEqual(obtained[i], expected[i], 4) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index c4585ad677..8309e791c8 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -24,54 +24,16 @@ """ import logging +import itertools + +import scipy.sparse as sps import numpy as np from gensim.topic_coherence import direct_confirmation_measure -from gensim.matutils import cossim logger = logging.getLogger(__name__) -def _present(w_prime_star, w, w_backtrack): - """ - Internal helper function to return index of (w_prime_star, w) in w_backtrack. - Return -1 if not present. - """ - index = -1 - flag = 0 - for arr in w_backtrack: - index += 1 - if np.all(w_prime_star == arr[0]) and np.all(w == arr[1]): - flag += 1 - break - if not flag: - return -1 - return index - - -def _make_seg(w_prime, w, accumulator, measure, gamma, backtrack): - """ - Internal helper function to return context vectors for segmentations. - """ - context_vectors = {} - if isinstance(w_prime, np.ndarray): - for w_j in w: - for w_i in w_prime: - if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], accumulator, measure[1])[0] - if w_j not in context_vectors: - context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma - else: - context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma - else: - for w_j in w: - if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], accumulator, measure[1])[0] - context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma - - return context_vectors, backtrack - - def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): """ This function calculates the indirect cosine measure. Given context vectors @@ -88,56 +50,101 @@ def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): ---- topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors. - num_docs : Total number of documents in corresponding corpus. Returns: ------- s_cos_sim : array of cosine similarity of the context vectors for each segmentation """ - if measure == 'nlr': - # make normalized log ratio measure tuple - measure = (direct_confirmation_measure.log_ratio_measure, True) - else: - raise ValueError("The direct confirmation measure you entered is not currently supported.") - backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2). 
- """ - For backtracking context vectors, we will create a list called w_backtrack to store (w_prime, w) or - (w_star, w) tuples and a corresponding list context_vector_backtrack which will create a - mapping of (w_prime or w_star, w) ---> context_vector. - """ - w_backtrack = [] - context_vector_backtrack = [] + context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) + s_cos_sim = [] - for top_words, s_i in zip(topics, segmented_topics): - for w_prime, w_star in s_i: - # Step 1. Check if (w_prime, top_words) tuple in w_backtrack. - # Step 2. If yes, return corresponding context vector - w_prime_index = _present(w_prime, top_words, w_backtrack) - if w_backtrack and w_prime_index != -1: - w_prime_context_vectors = context_vector_backtrack[w_prime_index] - else: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, accumulator, measure, gamma, backtrack) - backtrack.update(backtrack_i) - # Update backtracking lists - w_backtrack.append((w_prime, top_words)) - context_vector_backtrack.append(w_prime_context_vectors) - - # Step 1. Check if (w_star, top_words) tuple in w_backtrack. - # Step 2. If yes, check if corresponding w is the same - w_star_index = _present(w_star, top_words, w_backtrack) - if w_backtrack and w_star_index != -1: - w_star_context_vectors = context_vector_backtrack[w_star_index] - else: - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, accumulator, measure, gamma, backtrack) - backtrack.update(backtrack_i) - # Update all backtracking lists - w_backtrack.append((w_star, top_words)) - context_vector_backtrack.append(w_star_context_vectors) - - s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items()) - s_cos_sim.append(s_cos_sim_i) + for topic_words, topic_segments in zip(topics, segmented_topics): + topic_words = tuple(topic_words) # because tuples are hashable + for w_prime, w_star in topic_segments: + w_prime_cv = context_vectors[w_prime, topic_words] + w_star_cv = context_vectors[w_star, topic_words] + s_cos_sim.append(_cossim(w_prime_cv, w_star_cv)) return s_cos_sim + + +class ContextVectorComputer(object): + """Lazily compute context vectors for topic segments.""" + + def __init__(self, measure, topics, accumulator, gamma): + if measure == 'nlr': + self.similarity = _pair_npmi + else: + raise ValueError("The direct confirmation measure you entered is not currently supported.") + + self.mapping = _map_to_contiguous(topics) + self.vocab_size = len(self.mapping) + self.accumulator = accumulator + self.gamma = gamma + self.sim_cache = {} # Cache similarities between tokens represented as pairs of word ids, e.g. (1, 2) + self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector + + def __getitem__(self, idx): + return self.compute_context_vector(*idx) + + def compute_context_vector(self, segment_word_ids, topic_word_ids): + """ + Step 1. Check if (segment_word_ids, topic_word_ids) context vector has been cached. + Step 2. If yes, return corresponding context vector, else compute, cache, and return. 
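An illustrative call sequence (names are hypothetical; `accumulator` is any word-occurrence accumulator produced by the probability_estimation module):

    cv_computer = ContextVectorComputer('nlr', topics, accumulator, gamma=1)
    topic_words = tuple(topics[0])                 # tuples are hashable, hence cacheable
    vec = cv_computer[topics[0][0], topic_words]   # sparse column vector over the topic vocabulary
    vec2 = cv_computer[topics[0][0], topic_words]  # identical key, served from the cache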
+ """ + key = _key_for_segment(segment_word_ids, topic_word_ids) + context_vector = self.context_vector_cache.get(key, None) + if context_vector is None: + context_vector = self._make_seg(segment_word_ids, topic_word_ids) + self.context_vector_cache[key] = context_vector + return context_vector + + def _make_seg(self, segment_word_ids, topic_word_ids): + """Internal helper function to return context vectors for segmentations.""" + context_vector = sps.lil_matrix((self.vocab_size, 1)) + if not hasattr(segment_word_ids, '__iter__'): + segment_word_ids = (segment_word_ids,) + + for w_j in topic_word_ids: + idx = (self.mapping[w_j], 0) + for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids): + if pair not in self.sim_cache: + self.sim_cache[pair] = self.similarity(pair, self.accumulator) + + context_vector[idx] += self.sim_cache[pair] ** self.gamma + + return context_vector.tocsr() + + +def _pair_npmi(pair, accumulator): + """Compute normalized pairwise mutual information (NPMI) between a pair of words. + The pair is an iterable of (word_id1, word_id2). + """ + return direct_confirmation_measure.log_ratio_measure([[pair]], accumulator, True)[0] + + +def _cossim(cv1, cv2): + return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2)) + + +def _magnitude(sparse_vec): + return np.sqrt(np.sum(sparse_vec.data ** 2)) + + +def _map_to_contiguous(ids_iterable): + uniq_ids = {} + n = 0 + for id_ in itertools.chain.from_iterable(ids_iterable): + if id_ not in uniq_ids: + uniq_ids[id_] = n + n += 1 + return uniq_ids + + +def _key_for_segment(segment, topic_words): + """A segment may have a single number of an iterable of them.""" + segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment + return segment_key, topic_words From 327b7391ab2a0e01646a4009076d75237bffee7a Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 29 May 2017 20:56:14 -0400 Subject: [PATCH 12/33] #1342: Cleanup, documentation improvements, proper caching of accumulator in CoherenceModel, and various test fixes. --- gensim/models/coherencemodel.py | 120 +++++++++++++----- gensim/test/test_indirect_confirmation.py | 2 +- gensim/test/test_probability_estimation.py | 22 ++-- .../indirect_confirmation_measure.py | 6 +- .../topic_coherence/probability_estimation.py | 35 +++-- gensim/topic_coherence/text_analysis.py | 83 ++++++++---- 6 files changed, 179 insertions(+), 89 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index d0ff707457..a29eefe5fc 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -31,12 +31,13 @@ from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) +from gensim.topic_coherence.probability_estimation import unique_ids_from_segments from gensim.utils import is_corpus, FakeDict logger = logging.getLogger(__name__) -boolean_document_based = ['u_mass'] -sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] +boolean_document_based = {'u_mass'} +sliding_window_based = {'c_v', 'c_uci', 'c_npmi'} make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') coherence_dict = { @@ -66,9 +67,7 @@ class CoherenceModel(interfaces.TransformationABC): - """ - Objects of this class allow for building and maintaining a model for topic - coherence. + """Objects of this class allow for building and maintaining a model for topic coherence. 
The main methods are: @@ -169,21 +168,57 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.topn = topn self.model = model - if model is not None: - self.topics = self._get_topics() - elif topics is not None: - self.topics = [] - for topic in topics: - t_i = [] - for n, _ in enumerate(topic): - t_i.append(dictionary.token2id[topic[n]]) - self.topics.append(np.array(t_i)) - self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) self._accumulator = None + self._topics = None + self.topics = topics + + self.processes = processes if processes > 1 else max(1, mp.cpu_count() - 1) def __str__(self): - return coherence_dict[self.coherence].__str__() + return str(self.measure) + + @property + def measure(self): + return coherence_dict[self.coherence] + + @property + def topics(self): + return self._topics + + @topics.setter + def topics(self, topics): + new_topics = None + if self.model is not None: + new_topics = self._get_topics() + if topics is not None: + logger.warn("Ignoring topics you are attempting to set in favor of model's topics: %s" % self.model) + elif topics is not None: + new_topics = [] + for topic in topics: + t_i = np.array([self.dictionary.token2id[topic[n]] for n, _ in enumerate(topic)]) + new_topics.append(np.array(t_i)) + + if self._relevant_ids_will_differ(new_topics): + logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") + self._accumulator = None + + self._topics = new_topics + + def _relevant_ids_will_differ(self, new_topics): + if not self._topics_differ(new_topics): + return False + + measure = self.measure + current_set = unique_ids_from_segments(measure.seg(self.topics)) + new_set = unique_ids_from_segments(measure.seg(new_topics)) + return not current_set.issuperset(new_set) + + def _topics_differ(self, new_topics): + return (new_topics is not None and + self._topics is not None and + self._accumulator is not None and + not np.equal(new_topics, self._topics).all()) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" @@ -205,26 +240,49 @@ def _get_topics(self): "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics - def get_coherence_per_topic(self): - measure = coherence_dict[self.coherence] - segmented_topics = measure.seg(self.topics) + def segment_topics(self): + return self.measure.seg(self.topics) + + def estimate_probabilities(self, segmented_topics=None): + """Accumulate word occurrences and co-occurrences from texts or corpus using + the optimal method for the chosen coherence metric. This operation may take + quite some time for the sliding window based coherence methods. 
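A sketch of the intended call pattern (cm is a hypothetical CoherenceModel instance): accumulate once, then score without re-reading the texts:

    cm.estimate_probabilities()                # expensive pass over texts or corpus
    per_topic = cm.get_coherence_per_topic()   # reuses the cached accumulator
    overall = cm.aggregate_measures(per_topic)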
+ """ + if segmented_topics is None: + segmented_topics = self.segment_topics() if self.coherence in boolean_document_based: - self._accumulator = measure.prob(self.corpus, segmented_topics) - return measure.conf(segmented_topics, self._accumulator) - - self._accumulator = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size, - processes=self.processes) - if self.coherence == 'c_v': - return measure.conf(self.topics, segmented_topics, self._accumulator, 'nlr', 1) + self._accumulator = self.measure.prob(self.corpus, segmented_topics) else: - normalize = self.coherence == 'c_npmi' - return measure.conf(segmented_topics, self._accumulator, normalize=normalize) + self._accumulator = self.measure.prob( + texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size, + processes=self.processes) + + return self._accumulator + + def get_coherence_per_topic(self, segmented_topics=None): + """Return list of coherence values for each topic based on pipeline parameters.""" + measure = self.measure + if segmented_topics is None: + segmented_topics = measure.seg(self.topics) + if self._accumulator is None: + self.estimate_probabilities(segmented_topics) + + if self.coherence in boolean_document_based: + kwargs = {} + elif self.coherence == 'c_v': + kwargs = dict(topics=self.topics, measure='nlr', gamma=1) + else: + kwargs = dict(normalize=(self.coherence == 'c_npmi')) + + return measure.conf(segmented_topics, self._accumulator, **kwargs) def aggregate_measures(self, confirmed_measures): - measure = coherence_dict[self.coherence] - return measure.aggr(confirmed_measures) + """Aggregate the individual topic coherence measures using + the pipeline's aggregation function. + """ + return self.measure.aggr(confirmed_measures) def get_coherence(self): """Return coherence value based on pipeline parameters.""" diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index 6bdc8abe32..aedd9eaa9a 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -37,7 +37,7 @@ def setUp(self): def testCosineSimilarity(self): """Test cosine_similarity()""" obtained = indirect_confirmation_measure.cosine_similarity( - self.topics, self.segmentation, self.accumulator, self.measure, self.gamma) + self.segmentation, self.accumulator, self.topics, self.measure, self.gamma) # The steps involved in this calculation are as follows: # 1. Take (1, array([1, 2]). Take w' which is 1. diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 68ac24e752..f87b7bc564 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -56,17 +56,20 @@ def setUp(self): def testPBooleanDocument(self): """Test p_boolean_document()""" # Unique topic ids are 5798, 10608, 12736 and 18451 - obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. 
- obtained, _ = probability_estimation.p_boolean_sliding_window( + accumulator = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {10}, 5798: {4, 5, 6, 7}} - self.assertEqual(expected, obtained) + self.assertEqual(1, accumulator[10608]) + self.assertEqual(3, accumulator[12736]) + self.assertEqual(1, accumulator[18451]) + self.assertEqual(4, accumulator[5798]) class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): @@ -100,17 +103,20 @@ def setUp(self): def testPBooleanDocument(self): """Test p_boolean_document()""" - obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} self.assertEqual(expected, obtained) def testPBooleanSlidingWindow(self): """Test p_boolean_sliding_window()""" # Test with window size as 2. window_id is zero indexed. - obtained, _ = probability_estimation.p_boolean_sliding_window( + accumulator = probability_estimation.p_boolean_sliding_window( self.texts, self.segmented_topics, self.dictionary, 2) - expected = {1: {1}, 3: {8, 2, 3}, 9: {10}, 4: {4, 5, 6, 7}} - self.assertEqual(expected, obtained) + self.assertEqual(1, accumulator[1]) + self.assertEqual(3, accumulator[3]) + self.assertEqual(1, accumulator[9]) + self.assertEqual(4, accumulator[4]) if __name__ == '__main__': diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8309e791c8..eccfb0a3b5 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -34,7 +34,7 @@ logger = logging.getLogger(__name__) -def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): +def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1): """ This function calculates the indirect cosine measure. Given context vectors _ _ _ _ @@ -48,11 +48,11 @@ def cosine_similarity(topics, segmented_topics, accumulator, measure, gamma): Args: ---- - topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + topics : Topics obtained from the trained topic model. measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma : Gamma value for computing W', W* vectors. + gamma : Gamma value for computing W', W* vectors; default is 1. Returns: ------- diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 604fa07a24..fb583b99fc 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -11,30 +11,12 @@ import logging import itertools -import numpy as np - from gensim.topic_coherence.text_analysis import \ CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator logger = logging.getLogger(__name__) -def _ret_top_ids(segmented_topics): - """ - Helper function to return a set of all the unique topic ids in segmented topics. 
- """ - top_ids = set() # is a set of all the unique ids contained in topics. - for s_i in segmented_topics: - for word_id in itertools.chain.from_iterable(s_i): - if isinstance(word_id, np.ndarray): - for i in word_id: - top_ids.add(i) - else: - top_ids.add(word_id) - - return top_ids - - def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -50,7 +32,7 @@ def p_boolean_document(corpus, segmented_topics): per_topic_postings : Boolean document posting list for each unique topic id. num_docs : Total number of documents in corpus. """ - top_ids = _ret_top_ids(segmented_topics) + top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) @@ -73,10 +55,23 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p per_topic_postings : Boolean sliding window postings list of all the unique topic ids. window_id[0] : Total no of windows """ - top_ids = _ret_top_ids(segmented_topics) + top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: accumulator = WordOccurrenceAccumulator(top_ids, dictionary) else: accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) logger.info("using %s to estimate probabilities from sliding windows" % accumulator) return accumulator.accumulate(texts, window_size) + + +def unique_ids_from_segments(segmented_topics): + """Return the set of all unique ids in a list of segmented topics.""" + top_ids = set() # is a set of all the unique ids contained in topics. + for s_i in segmented_topics: + for word_id in itertools.chain.from_iterable(s_i): + if hasattr(word_id, '__iter__'): + top_ids = top_ids.union(word_id) + else: + top_ids.add(word_id) + + return top_ids diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a9265347a3..180d378e4b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -25,7 +25,7 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. This function abstracts away the differences between the HashDictionary and the standard one. - + Args: ---- ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). @@ -159,20 +159,21 @@ def analyze_text(self, text): def accumulate(self, corpus): for document in corpus: self.analyze_text(document) - self._num_docs += 1 + self.num_docs += 1 return self -class TextsAnalyzer(UsesDictionary): - """Gather some statistics about relevant terms a corpus by iterating over texts.""" +class WindowedTextsAnalyzer(UsesDictionary): + """Gather some statistics about relevant terms of a corpus by iterating over windows of texts.""" def __init__(self, relevant_ids, dictionary): """ Args: ---- - relevant_words: the set of words that occurrences should be accumulated for. + relevant_ids: the set of words that occurrences should be accumulated for. + dictionary: gensim.corpora.dictionary.Dictionary instance with mappings for the relevant_ids. 
""" - super(TextsAnalyzer, self).__init__(relevant_ids, dictionary) + super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) def filter_to_relevant_words(self, text): """Lazily filter the text to only those words which are relevant.""" @@ -195,7 +196,7 @@ def accumulate(self, texts, window_size): return self -class InvertedIndexAccumulator(TextsAnalyzer, InvertedIndexBased): +class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): """Build an inverted index from a sequence of corpus texts.""" def analyze_text(self, window): @@ -203,8 +204,8 @@ def analyze_text(self, window): self._inverted_index[word_id].add(self._num_docs) -class WordOccurrenceAccumulator(TextsAnalyzer): - """Accumulate word occurrences and co-occurrences from a corpus of texts.""" +class WordOccurrenceAccumulator(WindowedTextsAnalyzer): + """Accumulate word occurrences and co-occurrences from a sequence of corpus texts.""" def __init__(self, *args): super(WordOccurrenceAccumulator, self).__init__(*args) @@ -224,6 +225,20 @@ def analyze_text(self, window): for combo in itertools.combinations(relevant_words, 2): self._co_occurrences[combo] += 1 + def accumulate(self, texts, window_size): + self._co_occurrences = self._co_occurrences.tolil() + self.partial_accumulate(texts, window_size) + self._symmetrize() + return self + + def partial_accumulate(self, texts, window_size): + """Meant to be called several times to accumulate partial results. The final + accumulation should be performed with the `accumulate` method as opposed to this one. + This method does not ensure the co-occurrence matrix is in lil format and does not + symmetrize it after accumulation. + """ + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. Rather than enforcing a particular ordering during the update process, @@ -233,12 +248,6 @@ def _symmetrize(self): co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') - def accumulate(self, texts, window_size): - self._co_occurrences = self._co_occurrences.tolil() - super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - self._symmetrize() - return self - def _get_occurrences(self, word_id): return self._occurrences[word_id] @@ -251,14 +260,7 @@ def merge(self, other): self._num_docs += other._num_docs -class _WordOccurrenceAccumulator(WordOccurrenceAccumulator): - """Monkey patched to avoid symmetrizing co-occurrence matrix after each batch.""" - def accumulate(self, texts, window_size): - TextsAnalyzer.accumulate(self, texts, window_size) - return self - - -class ParallelWordOccurrenceAccumulator(TextsAnalyzer): +class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): """Accumulate word occurrences in parallel.""" def __init__(self, processes, *args, **kwargs): @@ -285,11 +287,17 @@ def accumulate(self, texts, window_size): return self.merge_accumulators(accumulators) def start_workers(self, window_size): + """Set up an input and output queue and start processes for each worker. + + The input queue is used to transmit batches of documents to the workers. + The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. + Returns: tuple of (list of workers, input queue, output queue). 
+ """ input_q = mp.Queue(maxsize=self.processes) output_q = mp.Queue() workers = [] for _ in range(self.processes): - accumulator = _WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) worker.start() workers.append(worker) @@ -297,6 +305,9 @@ def start_workers(self, window_size): return workers, input_q, output_q def yield_batches(self, texts): + """Return a generator over the given texts that yields batches of + `batch_size` texts at a time. + """ batch = [] for text in texts: batch.append(text) @@ -308,6 +319,9 @@ def yield_batches(self, texts): yield batch def queue_all_texts(self, q, texts, window_size): + """Sequentially place batches of texts on the given queue until `texts` is consumed. + The texts are filtered so that only those with at least one relevant token are queued. + """ relevant_texts = (text for text in texts if self.text_is_relevant(text)) for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): q.put(batch, block=True) @@ -318,6 +332,18 @@ def queue_all_texts(self, q, texts, window_size): batch_num, batch_num * self.batch_size, self._num_docs)) def terminate_workers(self, input_q, output_q, workers, interrupted=False): + """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, + then terminate each. We do not use join here because it has been shown to have some issues + in Python 2.7 (and even in later versions). This method also closes both the input and output + queue. + + If `interrupted` is False (normal execution), a None value is placed on the input queue for + each worker. The workers are looking for this sentinel value and interpret it as a signal to + terminate themselves. If `interrupted` is True, a KeyboardInterrupt occurred. The workers are + programmed to recover from this and continue on to transmit their results before terminating. + So in this instance, the sentinel values are not queued, but the rest of the execution + continues as usual. + """ if not interrupted: for _ in workers: input_q.put(None, block=True) @@ -336,9 +362,15 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): return accumulators def merge_accumulators(self, accumulators): + """Merge the list of accumulators into a single `WordOccurrenceAccumulator` with all + occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed + by all the individual accumulators. + """ accumulator = accumulators[0] for other_accumulator in accumulators[1:]: accumulator.merge(other_accumulator) + # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. + # This is by design, to avoid unnecessary matrix additions during accumulation. 
accumulator._symmetrize() return accumulator @@ -371,7 +403,7 @@ def _run(self): if docs is None: # sentinel value break - self.accumulator.accumulate(docs, self.window_size) + self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( batch_num, n_docs, self.accumulator.num_docs)) @@ -381,4 +413,3 @@ def reply_to_master(self): logger.info("serializing accumulator to return to master...") self.output_q.put(self.accumulator, block=False) logger.info("accumulator serialized") - From e06c7c3c53dcaebf727da89c9f24b0af790a9fce Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 05:41:50 -0400 Subject: [PATCH 13/33] #1342: Do not swallow `KeyboardInterrupt` naively in `WikiCorpus.get_texts`; instead, log warning and do not set `length`. --- gensim/corpora/wikicorpus.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 209946fb41..13b111db4f 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -250,7 +250,8 @@ def process_article(args): return result, title, pageid -def init_worker(): +def init_to_ignore_interrupt(): + """Should only be used when master is prepared to handle termination of child processes.""" signal.signal(signal.SIGINT, signal.SIG_IGN) @@ -304,13 +305,16 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) - pool = multiprocessing.Pool(self.processes, init_worker) - # process the corpus in smaller chunks of docs, because multiprocessing.Pool - # is dumb and would load the entire input into RAM at once... + texts = ((text, self.lemmatize, title, pageid) + for title, text, pageid + in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) + try: + # process the corpus in smaller chunks of docs, because multiprocessing.Pool + # is dumb and would load the entire input into RAM at once... 
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): - for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): + for tokens, title, pageid in pool.imap(process_article, group): articles_all += 1 positions_all += len(tokens) # article redirects and short stubs are pruned here @@ -323,13 +327,15 @@ def get_texts(self): else: yield tokens except KeyboardInterrupt: - pass - - pool.terminate() - - logger.info( - "finished iterating over Wikipedia corpus of %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) - self.length = articles # cache corpus length + logger.warn("user terminated iteration over Wikipedia corpus after %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + else: + logger.info( + "finished iterating over Wikipedia corpus of %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + self.length = articles # cache corpus length + finally: + pool.terminate() # endclass WikiCorpus From 2ca43f7378e962e33d7be4e836444ad2bfbe0117 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 05:47:01 -0400 Subject: [PATCH 14/33] #1342: Formatting fixes (hanging indent in `coherencemodel` and non-empty blank lines in `text_analysis`. --- gensim/models/coherencemodel.py | 6 +++--- gensim/topic_coherence/text_analysis.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index a29eefe5fc..cff32fe2c2 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -216,9 +216,9 @@ def _relevant_ids_will_differ(self, new_topics): def _topics_differ(self, new_topics): return (new_topics is not None and - self._topics is not None and - self._accumulator is not None and - not np.equal(new_topics, self._topics).all()) + self._topics is not None and + self._accumulator is not None and + not np.equal(new_topics, self._topics).all()) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 180d378e4b..7b12572fb8 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -288,7 +288,7 @@ def accumulate(self, texts, window_size): def start_workers(self, window_size): """Set up an input and output queue and start processes for each worker. - + The input queue is used to transmit batches of documents to the workers. The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. Returns: tuple of (list of workers, input queue, output queue). @@ -336,7 +336,7 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): then terminate each. We do not use join here because it has been shown to have some issues in Python 2.7 (and even in later versions). This method also closes both the input and output queue. - + If `interrupted` is False (normal execution), a None value is placed on the input queue for each worker. The workers are looking for this sentinel value and interpret it as a signal to terminate themselves. 
If `interrupted` is True, a KeyboardInterrupt occurred. The workers are From 825b0e9f8f60b1f6c217f54eca3fb213b4e9e80a Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 06:03:47 -0400 Subject: [PATCH 15/33] #1342: Improve `CoherenceModel` documentation and minor refactor for variable interpretability. --- gensim/models/coherencemodel.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index cff32fe2c2..15d680a06c 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -73,6 +73,19 @@ class CoherenceModel(interfaces.TransformationABC): 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, 2. the ``get_coherence()`` method, which returns the topic coherence. + + Pipeline phases can also be executed individually. Methods for doing this are: + + 1. `segment_topics()`, which performs segmentation of the given topics into their comparison sets. + 2. `estimate_probabilities()`, which accumulates word occurrence stats from the given corpus or texts. + The output of this is also cached on the `CoherenceModel`, so calling this method can be used as + a precomputation step for the next phase. + 3. `get_coherence_per_topic()`, which uses the segmented topics and estimated probabilities to compute + the coherence of each topic. This output can be used to rank topics in order of most coherent to + least. Such a ranking is useful if the intended use case of a topic model is document exploration + by a human. It is also useful for filtering out incoherent topics (keep top-n from ranked list). + 4. `aggregate_measures(topic_coherences)`, which uses the pipeline's aggregation method to compute + the overall coherence from the topic coherences. One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly provided if the model does not contain a dictionary already:: @@ -108,8 +121,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] corpus : Gensim document corpus. - dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. - If both are provided, dictionary will be used. + dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, + this is not needed. If both are provided, dictionary will be used. window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their probability estimator. For 'u_mass' this doesn't matter. If left 'None' the default window sizes are used which are: @@ -121,9 +134,12 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= 'c_v' 'c_uci' also popularly known as c_pmi 'c_npmi' - For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. - For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed. + For 'u_mass' corpus should be provided. If texts is provided, it will be converted + to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. + Corpus is not needed. topn : Integer corresponding to the number of top words to be extracted from each topic. 
+ processes : number of processes to use for probability estimation phase; any value less than 1 will be + interpreted to mean num_cpus - 1; default is -1. """ if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") @@ -196,8 +212,8 @@ def topics(self, topics): elif topics is not None: new_topics = [] for topic in topics: - t_i = np.array([self.dictionary.token2id[topic[n]] for n, _ in enumerate(topic)]) - new_topics.append(np.array(t_i)) + topic_token_ids = np.array([self.dictionary.token2id[token] for token in topic]) + new_topics.append(topic_token_ids) if self._relevant_ids_will_differ(new_topics): logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") @@ -278,11 +294,11 @@ def get_coherence_per_topic(self, segmented_topics=None): return measure.conf(segmented_topics, self._accumulator, **kwargs) - def aggregate_measures(self, confirmed_measures): + def aggregate_measures(self, topic_coherences): """Aggregate the individual topic coherence measures using the pipeline's aggregation function. """ - return self.measure.aggr(confirmed_measures) + return self.measure.aggr(topic_coherences) def get_coherence(self): """Return coherence value based on pipeline parameters.""" From 314a400912ead837e99f2ce30e9be2cbe0381ff9 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 30 May 2017 17:03:44 -0400 Subject: [PATCH 16/33] #1342: Optimize word occurrence accumulation and fix a bug with repeated counting of tokens that occur more than once in a window. --- gensim/test/test_text_analysis.py | 20 ++++--- gensim/topic_coherence/text_analysis.py | 72 ++++++++++++++++--------- gensim/utils.py | 27 ++++++---- 3 files changed, 77 insertions(+), 42 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 8ee08a2373..ed6d482b44 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -12,7 +12,8 @@ class TextAnalyzerTestBase(unittest.TestCase): texts = [ ['this', 'is', 'a'], ['test', 'document'], - ['this', 'test', 'document'] + ['this', 'test', 'document'], + ['test', 'test', 'this'] ] token2id = { 'this': 10, @@ -51,11 +52,12 @@ def init_accumulator2(self): def test_occurrence_counting(self): accumulator = self.init_accumulator()\ .accumulate(self.texts, 3) - self.assertEqual(2, accumulator.get_occurrences("this")) + self.assertEqual(3, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) self.assertEqual(1, accumulator.get_occurrences("a")) self.assertEqual(2, accumulator.get_co_occurrences("test", "document")) + self.assertEqual(2, accumulator.get_co_occurrences("test", "this")) self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): @@ -101,13 +103,14 @@ class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase): def test_accumulate1(self): accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 2) - # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], ['test', 'document']] + # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'], + # ['test', 'document'], ['test', 'test'], ['test', 'this']] inverted_index = accumulator.index_to_dict() expected = { - 10: {0, 3}, + 10: {0, 3, 6}, 15: {0, 1}, 20: {1}, - 21: {2, 3, 4}, + 21: {2, 3, 4, 5, 6}, 17: {2, 4} } self.assertDictEqual(expected, inverted_index) @@ -115,13 +118,14 @@ def test_accumulate1(self): def test_accumulate2(self): 
accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ .accumulate(self.texts, 3) - # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document']] + # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document'], + # ['test', 'test', 'this'] inverted_index = accumulator.index_to_dict() expected = { - 10: {0, 2}, + 10: {0, 2, 3}, 15: {0}, 20: {0}, - 21: {1, 2}, + 21: {1, 2, 3}, 17: {1, 2} } self.assertDictEqual(expected, inverted_index) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 7b12572fb8..b2b43e9382 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -65,7 +65,7 @@ def num_docs(self, num): logger.info("%s accumulated stats from %d documents" % ( self.__class__.__name__, self._num_docs)) - def analyze_text(self, text): + def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") def __getitem__(self, word_or_words): @@ -100,9 +100,6 @@ def __init__(self, relevant_ids, dictionary): self.dictionary = dictionary self.token2id = dictionary.token2id - def analyze_text(self, text): - raise NotImplementedError("Base classes should implement analyze_text.") - def get_occurrences(self, word): """Return number of docs the word occurs in, once `accumulate` has been called.""" try: @@ -149,7 +146,7 @@ def index_to_dict(self): class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" - def analyze_text(self, text): + def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) if len(top_ids_in_doc) > 0: @@ -164,7 +161,7 @@ def accumulate(self, corpus): class WindowedTextsAnalyzer(UsesDictionary): - """Gather some statistics about relevant terms of a corpus by iterating over windows of texts.""" + """Gather some stats about relevant terms of a corpus by iterating over windows of texts.""" def __init__(self, relevant_ids, dictionary): """ @@ -181,6 +178,22 @@ def filter_to_relevant_words(self, text): relevant_ids = (self.token2id[word] for word in relevant_words) return (self.id2contiguous[word_id] for word_id in relevant_ids) + def accumulate(self, texts, window_size): + relevant_texts = self._iter_texts(texts) + windows = utils.iter_windows(relevant_texts, window_size, ignore_below_size=False, + include_doc_num=True) + for doc_num, virtual_document in windows: + self.analyze_text(virtual_document, doc_num) + self.num_docs += 1 + return self + + def _iter_texts(self, texts): + for text in texts: + if self.text_is_relevant(text): + token_ids = (self.token2id[word] if word in self.relevant_words else None + for word in text) + yield [self.id2contiguous[_id] if _id is not None else None for _id in token_ids] + def text_is_relevant(self, text): """Return True if the text has any relevant words, else False.""" for word in text: @@ -188,20 +201,14 @@ def text_is_relevant(self, text): return True return False - def accumulate(self, texts, window_size): - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for virtual_document in utils.iter_windows(relevant_texts, window_size, ignore_below_size=False): - self.analyze_text(virtual_document) - self.num_docs += 1 - return self - class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): """Build an inverted index from a sequence of corpus texts.""" - def 
analyze_text(self, window): - for word_id in self.filter_to_relevant_words(window): - self._inverted_index[word_id].add(self._num_docs) + def analyze_text(self, window, doc_num=None): + for word_id in window: + if word_id is not None: + self._inverted_index[word_id].add(self._num_docs) class WordOccurrenceAccumulator(WindowedTextsAnalyzer): @@ -216,15 +223,6 @@ def __init__(self, *args): def __str__(self): return self.__class__.__name__ - def analyze_text(self, window): - relevant_words = list(self.filter_to_relevant_words(window)) - if relevant_words: - uniq_words = np.array(relevant_words) - self._occurrences[uniq_words] += 1 - - for combo in itertools.combinations(relevant_words, 2): - self._co_occurrences[combo] += 1 - def accumulate(self, texts, window_size): self._co_occurrences = self._co_occurrences.tolil() self.partial_accumulate(texts, window_size) @@ -237,7 +235,31 @@ def partial_accumulate(self, texts, window_size): This method does not ensure the co-occurrence matrix is in lil format and does not symmetrize it after accumulation. """ + self._current_doc_num = -1 + self._token_at_edge = None super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + return self + + def analyze_text(self, window, doc_num=None): + if doc_num != self._current_doc_num: + self._uniq_words = set(window) + self._uniq_words.discard(None) + self._token_at_edge = window[0] + self._current_doc_num = doc_num + else: + if self._token_at_edge is not None: + self._uniq_words.remove(self._token_at_edge) + self._token_at_edge = window[0] + + if window[-1] is not None: + self._uniq_words.add(window[-1]) + + if self._uniq_words: + words_idx = np.array(list(self._uniq_words)) + self._occurrences[words_idx] += 1 + + for combo in itertools.combinations(words_idx, 2): + self._co_occurrences[combo] += 1 def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. diff --git a/gensim/utils.py b/gensim/utils.py index f0488d2943..8b57871d5a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1229,7 +1229,7 @@ def strided_windows(ndarray, window_size): strides=(stride, stride)) -def iter_windows(texts, window_size, copy=False, ignore_below_size=True): +def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False): """Produce a generator over the given texts using a sliding window of `window_size`. The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. @@ -1243,11 +1243,20 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True): If False, the documents below `window_size` will be yielded as the full document. 
""" - for document in texts: - doc_windows = strided_windows(document, window_size) - if doc_windows.shape[0] == 0: - if not ignore_below_size: - yield document.copy() if copy else document - else: - for doc_window in doc_windows: - yield doc_window.copy() if copy else doc_window + for doc_num, document in enumerate(texts): + for window in _iter_windows(document, window_size, copy, ignore_below_size): + if include_doc_num: + yield (doc_num, window) + else: + yield window + + +def _iter_windows(document, window_size, copy=False, ignore_below_size=True): + doc_windows = strided_windows(document, window_size) + if doc_windows.shape[0] == 0: + if not ignore_below_size: + yield document.copy() if copy else document + else: + for doc_window in doc_windows: + yield doc_window.copy() if copy else doc_window + From e7857734f0d44b71b80ec4e3f3ef1ef6bb2eaa47 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 10:42:32 -0400 Subject: [PATCH 17/33] #1342: Minor bug fixes and improved logging in text_analysis module; cleaned up spacing in coherencemodel. --- gensim/models/coherencemodel.py | 4 ++-- gensim/topic_coherence/text_analysis.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 15d680a06c..adcac0f27a 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -73,9 +73,9 @@ class CoherenceModel(interfaces.TransformationABC): 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, 2. the ``get_coherence()`` method, which returns the topic coherence. - + Pipeline phases can also be executed individually. Methods for doing this are: - + 1. `segment_topics()`, which performs segmentation of the given topics into their comparison sets. 2. `estimate_probabilities()`, which accumulates word occurrence stats from the given corpus or texts. The output of this is also cached on the `CoherenceModel`, so calling this method can be used as diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index b2b43e9382..0a61c5ba0e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -10,8 +10,9 @@ """ import sys -import itertools import logging +import itertools +import traceback import multiprocessing as mp import numpy as np @@ -248,7 +249,7 @@ def analyze_text(self, window, doc_num=None): self._current_doc_num = doc_num else: if self._token_at_edge is not None: - self._uniq_words.remove(self._token_at_edge) + self._uniq_words.discard(self._token_at_edge) # may be irrelevant token self._token_at_edge = window[0] if window[-1] is not None: @@ -351,7 +352,7 @@ def queue_all_texts(self, q, texts, window_size): self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( - batch_num, batch_num * self.batch_size, self._num_docs)) + batch_num, (batch_num + 1) * self.batch_size, self._num_docs)) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -394,6 +395,8 @@ def merge_accumulators(self, accumulators): # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions during accumulation. 
accumulator._symmetrize() + logger.info("accumulated word occurrence stats for %d virtual documents" % + accumulator.num_docs) return accumulator @@ -411,9 +414,13 @@ def __init__(self, input_q, output_q, accumulator, window_size): def run(self): try: self._run() + print("finished normally") except KeyboardInterrupt: logger.info("%s interrupted after processing %d documents" % ( self.__class__.__name__, self.accumulator.num_docs)) + except Exception as e: + logger.error("worker encountered unexpected exception: %s" % e) + logger.error(traceback.format_exc()) finally: self.reply_to_master() @@ -423,6 +430,7 @@ def _run(self): while True: docs = self.input_q.get(block=True) if docs is None: # sentinel value + logger.debug("observed sentinel value; terminating") break self.accumulator.partial_accumulate(docs, self.window_size) From 5f78cdb2bcea50975fcf8cabb3565f337406ed59 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:03:32 -0400 Subject: [PATCH 18/33] #1342: Optimize data structures being used for window set tracking and avoid undue network traffic by moving relevancy filtering and token conversion to the master process. --- gensim/topic_coherence/text_analysis.py | 123 ++++++++++++++---------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 0a61c5ba0e..81989992d9 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -51,6 +51,7 @@ class BaseAnalyzer(object): def __init__(self, relevant_ids): self.relevant_ids = relevant_ids + self._vocab_size = len(self.relevant_ids) self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} self.log_every = 1000 self._num_docs = 0 @@ -92,7 +93,8 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. - The standard BaseAnalyzer can only deal with token ids since it does not have access to the token2id mapping. + The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id + mapping. """ def __init__(self, relevant_ids, dictionary): @@ -128,8 +130,7 @@ class InvertedIndexBased(BaseAnalyzer): def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) - vocab_size = len(self.relevant_ids) - self._inverted_index = np.array([set() for _ in range(vocab_size)]) + self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) def _get_occurrences(self, word_id): return len(self._inverted_index[word_id]) @@ -169,15 +170,10 @@ def __init__(self, relevant_ids, dictionary): Args: ---- relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: gensim.corpora.dictionary.Dictionary instance with mappings for the relevant_ids. + dictionary: Dictionary instance with mappings for the relevant_ids. 
""" super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) - - def filter_to_relevant_words(self, text): - """Lazily filter the text to only those words which are relevant.""" - relevant_words = (word for word in text if word in self.relevant_words) - relevant_ids = (self.token2id[word] for word in relevant_words) - return (self.id2contiguous[word_id] for word_id in relevant_ids) + self._none_token = self._vocab_size # see _iter_texts for use of none token def accumulate(self, texts, window_size): relevant_texts = self._iter_texts(texts) @@ -189,11 +185,13 @@ def accumulate(self, texts, window_size): return self def _iter_texts(self, texts): + dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: if self.text_is_relevant(text): - token_ids = (self.token2id[word] if word in self.relevant_words else None - for word in text) - yield [self.id2contiguous[_id] if _id is not None else None for _id in token_ids] + yield np.array([ + self.id2contiguous[self.token2id[w]] if w in self.relevant_words + else self._none_token + for w in text], dtype=dtype) def text_is_relevant(self, text): """Return True if the text has any relevant words, else False.""" @@ -208,7 +206,7 @@ class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): def analyze_text(self, window, doc_num=None): for word_id in window: - if word_id is not None: + if word_id is not self._none_token: self._inverted_index[word_id].add(self._num_docs) @@ -217,9 +215,11 @@ class WordOccurrenceAccumulator(WindowedTextsAnalyzer): def __init__(self, *args): super(WordOccurrenceAccumulator, self).__init__(*args) - vocab_size = len(self.relevant_words) - self._occurrences = np.zeros(vocab_size, dtype='uint32') - self._co_occurrences = sps.lil_matrix((vocab_size, vocab_size), dtype='uint32') + self._occurrences = np.zeros(self._vocab_size, dtype='uint32') + self._co_occurrences = sps.lil_matrix((self._vocab_size, self._vocab_size), dtype='uint32') + + self._uniq_words = np.zeros((self._vocab_size + 1,), dtype=bool) # add 1 for none token + self._mask = self._uniq_words[:-1] # to exclude none token def __str__(self): return self.__class__.__name__ @@ -242,25 +242,23 @@ def partial_accumulate(self, texts, window_size): return self def analyze_text(self, window, doc_num=None): + self.slide_window(window, doc_num) + if self._mask.any(): + self._occurrences[self._mask] += 1 + + for combo in itertools.combinations(np.nonzero(mask)[0], 2): + self._co_occurrences[combo] += 1 + + def slide_window(self, window, doc_num): if doc_num != self._current_doc_num: - self._uniq_words = set(window) - self._uniq_words.discard(None) - self._token_at_edge = window[0] + self._uniq_words[:] = False + self._uniq_words[np.unique(window)] = True self._current_doc_num = doc_num else: - if self._token_at_edge is not None: - self._uniq_words.discard(self._token_at_edge) # may be irrelevant token - self._token_at_edge = window[0] + self._uniq_words[self._token_at_edge] = False + self._uniq_words[window[-1]] = True - if window[-1] is not None: - self._uniq_words.add(window[-1]) - - if self._uniq_words: - words_idx = np.array(list(self._uniq_words)) - self._occurrences[words_idx] += 1 - - for combo in itertools.combinations(words_idx, 2): - self._co_occurrences[combo] += 1 + self._token_at_edge = window[0] def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. 
@@ -283,15 +281,31 @@ def merge(self, other): self._num_docs += other._num_docs +class PatchedWordOccurrenceAccumulator(WordOccurrenceAccumulator): + """Monkey patched for multiprocessing worker usage, + to move some of the logic to the master process. + """ + def _iter_texts(self, texts): + return texts # master process will handle this + + class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): """Accumulate word occurrences in parallel.""" def __init__(self, processes, *args, **kwargs): + """ + Args: + ---- + processes : number of processes to use; must be at least two. + args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). + kwargs : can include `batch_size`, which is the number of docs to send to a worker at a + time. If not included, it defaults to 32. + """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: - raise ValueError("Must have at least 2 processes to run in parallel; got %d" % processes) + raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) self.processes = processes - self.batch_size = kwargs.get('batch_size', 16) + self.batch_size = kwargs.get('batch_size', 32) def __str__(self): return "%s(processes=%s, batch_size=%s)" % ( @@ -303,7 +317,8 @@ def accumulate(self, texts, window_size): self.queue_all_texts(input_q, texts, window_size) interrupted = False except KeyboardInterrupt: - logger.warn("stats accumulation interrupted; <= %d documents processed" % self._num_docs) + logger.warn("stats accumulation interrupted; <= %d documents processed", + self._num_docs) interrupted = True accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) @@ -320,7 +335,7 @@ def start_workers(self, window_size): output_q = mp.Queue() workers = [] for _ in range(self.processes): - accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + accumulator = PatchedWordOccurrenceAccumulator(self.relevant_ids, self.dictionary) worker = AccumulatingWorker(input_q, output_q, accumulator, window_size) worker.start() workers.append(worker) @@ -332,7 +347,7 @@ def yield_batches(self, texts): `batch_size` texts at a time. """ batch = [] - for text in texts: + for text in self._iter_texts(texts): batch.append(text) if len(batch) == self.batch_size: yield batch @@ -345,14 +360,14 @@ def queue_all_texts(self, q, texts, window_size): """Sequentially place batches of texts on the given queue until `texts` is consumed. The texts are filtered so that only those with at least one relevant token are queued. 
""" - relevant_texts = (text for text in texts if self.text_is_relevant(text)) - for batch_num, batch in enumerate(self.yield_batches(relevant_texts)): + for batch_num, batch in enumerate(self.yield_batches(texts)): q.put(batch, block=True) before = self._num_docs / self.log_every self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): - logger.info("submitted %d batches to accumulate stats from %d documents (%d virtual)" % ( - batch_num, (batch_num + 1) * self.batch_size, self._num_docs)) + logger.info("%d batches submitted to accumulate stats from %d documents (%d " + "virtual)", + (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -392,10 +407,10 @@ def merge_accumulators(self, accumulators): accumulator = accumulators[0] for other_accumulator in accumulators[1:]: accumulator.merge(other_accumulator) - # Workers perform partial accumulation, so none of the co-occurrence matrices are symmetrized. - # This is by design, to avoid unnecessary matrix additions during accumulation. + # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. + # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. accumulator._symmetrize() - logger.info("accumulated word occurrence stats for %d virtual documents" % + logger.info("accumulated word occurrence stats for %d virtual documents", accumulator.num_docs) return accumulator @@ -414,20 +429,20 @@ def __init__(self, input_q, output_q, accumulator, window_size): def run(self): try: self._run() - print("finished normally") except KeyboardInterrupt: - logger.info("%s interrupted after processing %d documents" % ( - self.__class__.__name__, self.accumulator.num_docs)) + logger.info("%s interrupted after processing %d documents", + self.__class__.__name__, self.accumulator.num_docs) except Exception as e: - logger.error("worker encountered unexpected exception: %s" % e) - logger.error(traceback.format_exc()) + logger.error("worker encountered unexpected exception: %s\n%s", + e, traceback.format_exc()) finally: self.reply_to_master() def _run(self): - batch_num = 0 + batch_num = -1 n_docs = 0 while True: + batch_num += 1 docs = self.input_q.get(block=True) if docs is None: # sentinel value logger.debug("observed sentinel value; terminating") @@ -435,9 +450,11 @@ def _run(self): self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) - logger.debug("completed batch %d; %d documents processed (%d virtual)" % ( - batch_num, n_docs, self.accumulator.num_docs)) - batch_num += 1 + logger.debug("completed batch %d; %d documents processed (%d virtual)", + batch_num, n_docs, self.accumulator.num_docs) + + logger.debug("finished all batches; %d documents processed (%d virtual)", + n_docs, self.accumulator.num_docs) def reply_to_master(self): logger.info("serializing accumulator to return to master...") From bbd27482f140521f64d1a396f9c2b91168881cb1 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:04:12 -0400 Subject: [PATCH 19/33] #1342: Fix accidental typo. 
--- gensim/topic_coherence/text_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 81989992d9..2424ad9ce9 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -246,7 +246,7 @@ def analyze_text(self, window, doc_num=None): if self._mask.any(): self._occurrences[self._mask] += 1 - for combo in itertools.combinations(np.nonzero(mask)[0], 2): + for combo in itertools.combinations(np.nonzero(self._mask)[0], 2): self._co_occurrences[combo] += 1 def slide_window(self, window, doc_num): From 5fb0b959039586d366bd1f128108d105aa338550 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 31 May 2017 14:31:26 -0400 Subject: [PATCH 20/33] #1342: Further optimize word co-occurrence accumulation by using a `collections.Counter` instance for accumulation within a batch. --- gensim/topic_coherence/text_analysis.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 2424ad9ce9..371cfd22f5 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -14,6 +14,7 @@ import itertools import traceback import multiprocessing as mp +from collections import Counter import numpy as np import scipy.sparse as sps @@ -93,7 +94,7 @@ def _get_co_occurrences(self, word_id1, word_id2): class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. - The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id + The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id mapping. """ @@ -220,6 +221,7 @@ def __init__(self, *args): self._uniq_words = np.zeros((self._vocab_size + 1,), dtype=bool) # add 1 for none token self._mask = self._uniq_words[:-1] # to exclude none token + self._counter = Counter() def __str__(self): return self.__class__.__name__ @@ -238,18 +240,21 @@ def partial_accumulate(self, texts, window_size): """ self._current_doc_num = -1 self._token_at_edge = None + self._counter.clear() + super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) + for combo, count in self._counter.iteritems(): + self._co_occurrences[combo] += count + return self def analyze_text(self, window, doc_num=None): - self.slide_window(window, doc_num) + self._slide_window(window, doc_num) if self._mask.any(): self._occurrences[self._mask] += 1 + self._counter.update(itertools.combinations(np.nonzero(self._mask)[0], 2)) - for combo in itertools.combinations(np.nonzero(self._mask)[0], 2): - self._co_occurrences[combo] += 1 - - def slide_window(self, window, doc_num): + def _slide_window(self, window, doc_num): if doc_num != self._current_doc_num: self._uniq_words[:] = False self._uniq_words[np.unique(window)] = True @@ -298,14 +303,14 @@ def __init__(self, processes, *args, **kwargs): ---- processes : number of processes to use; must be at least two. args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 32. + kwargs : can include `batch_size`, which is the number of docs to send to a worker at a + time. If not included, it defaults to 64. 
""" super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) self.processes = processes - self.batch_size = kwargs.get('batch_size', 32) + self.batch_size = kwargs.get('batch_size', 64) def __str__(self): return "%s(processes=%s, batch_size=%s)" % ( From 880b8d08d146dc5c3affdb8efaa28c77e077db50 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 10:47:33 -0400 Subject: [PATCH 21/33] #1342: Clean up logging in `text_analysis` module and remove empty line at end of `util` module. --- gensim/topic_coherence/text_analysis.py | 17 ++++++++--------- gensim/utils.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 371cfd22f5..6062c445b0 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -65,8 +65,8 @@ def num_docs(self): def num_docs(self, num): self._num_docs = num if self._num_docs % self.log_every == 0: - logger.info("%s accumulated stats from %d documents" % ( - self.__class__.__name__, self._num_docs)) + logger.info("%s accumulated stats from %d documents", + self.__class__.__name__, self._num_docs) def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") @@ -370,9 +370,9 @@ def queue_all_texts(self, q, texts, window_size): before = self._num_docs / self.log_every self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): - logger.info("%d batches submitted to accumulate stats from %d documents (%d " - "virtual)", - (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) + logger.info( + "%d batches submitted to accumulate stats from %d documents (%d virtual)", + (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -394,7 +394,7 @@ def terminate_workers(self, input_q, output_q, workers, interrupted=False): accumulators = [] while len(accumulators) != len(workers): accumulators.append(output_q.get()) - logger.info("%d accumulators retrieved from output queue" % len(accumulators)) + logger.info("%d accumulators retrieved from output queue", len(accumulators)) for worker in workers: if worker.is_alive(): @@ -437,9 +437,8 @@ def run(self): except KeyboardInterrupt: logger.info("%s interrupted after processing %d documents", self.__class__.__name__, self.accumulator.num_docs) - except Exception as e: - logger.error("worker encountered unexpected exception: %s\n%s", - e, traceback.format_exc()) + except: + logger.exception("worker encountered unexpected exception") finally: self.reply_to_master() diff --git a/gensim/utils.py b/gensim/utils.py index 8b57871d5a..dd391f887b 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1259,4 +1259,3 @@ def _iter_windows(document, window_size, copy=False, ignore_below_size=True): else: for doc_window in doc_windows: yield doc_window.copy() if copy else doc_window - From 1d32b8eb8d29f3729a8029e7deacab159d1f03e5 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 11:07:20 -0400 Subject: [PATCH 22/33] #1342: Remove unused traceback module. 
--- gensim/topic_coherence/text_analysis.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6062c445b0..6a6cd6aaae 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -12,7 +12,6 @@ import sys import logging import itertools -import traceback import multiprocessing as mp from collections import Counter From 8e04b416cf0c6459dbef041dbc5345ac191a7e3c Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 11:47:07 -0400 Subject: [PATCH 23/33] #1342: Fixes for python3 compatibility. --- gensim/topic_coherence/text_analysis.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6a6cd6aaae..d73a3f7b8e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,6 +17,7 @@ import numpy as np import scipy.sparse as sps +from six import iteritems from gensim import utils @@ -141,7 +142,7 @@ def _get_co_occurrences(self, word_id1, word_id2): return len(s1.intersection(s2)) def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in self.id2contiguous.iteritems()} + contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)} return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} @@ -242,7 +243,7 @@ def partial_accumulate(self, texts, window_size): self._counter.clear() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - for combo, count in self._counter.iteritems(): + for combo, count in iteritems(self._counter): self._co_occurrences[combo] += count return self @@ -427,7 +428,7 @@ def __init__(self, input_q, output_q, accumulator, window_size): self.input_q = input_q self.output_q = output_q self.accumulator = accumulator - self.accumulator.log_every = sys.maxint # avoid logging in workers + self.accumulator.log_every = sys.maxsize # avoid logging in workers self.window_size = window_size def run(self): From e3ce40244d8514d4d2311526f7613a2bd689a643 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 13:56:35 -0400 Subject: [PATCH 24/33] #1342: Hopefully `six.viewitems` works for python3 compatibility? 
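For the record: `six.viewitems(d)` resolves to `d.viewitems()` on Python 2 and
`d.items()` on Python 3, so it yields (key, value) pairs from both plain dicts
and `collections.Counter` without relying on the `iteritems` method that
Python 3 removed. (`six.iteritems` would also have iterated fine on Python 3;
the real breakage turned out to be elsewhere, as the next commit notes.)
Quick sanity check (standalone snippet; assumes only that `six` is installed):

    from collections import Counter

    from six import viewitems

    co_occurrence_counts = Counter({(0, 2): 3, (1, 2): 1})
    for token_pair, count in viewitems(co_occurrence_counts):
        print(token_pair, count)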
--- gensim/topic_coherence/text_analysis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index d73a3f7b8e..7e8e57d703 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,7 +17,7 @@ import numpy as np import scipy.sparse as sps -from six import iteritems +from six import viewitems from gensim import utils @@ -142,7 +142,7 @@ def _get_co_occurrences(self, word_id1, word_id2): return len(s1.intersection(s2)) def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)} + contiguous2id = {n: word_id for word_id, n in viewitems(self.id2contiguous)} return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} @@ -243,7 +243,7 @@ def partial_accumulate(self, texts, window_size): self._counter.clear() super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - for combo, count in iteritems(self._counter): + for combo, count in viewitems(self._counter): self._co_occurrences[combo] += count return self From 7f7f55daf335de54793e63349aff22ce0ce123f2 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 1 Jun 2017 16:19:34 -0400 Subject: [PATCH 25/33] #1342: Realized the python3 compatibility issue was due to the Dictionary mapping to different ids, so fixed the `probability_estimation` tests to be agnostic of this. Also fixed an issue with the interpretation of strings as iterables when getting occurrences of strings in the `text_analysis.BaseAnalyzer.__getitem__` method. --- gensim/test/test_probability_estimation.py | 167 +++++++++------------ gensim/topic_coherence/text_analysis.py | 8 +- 2 files changed, 75 insertions(+), 100 deletions(-) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index f87b7bc564..982230a526 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -16,107 +16,82 @@ from gensim.corpora.dictionary import Dictionary -class ProbabilityEstimationBase(unittest.TestCase): - texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] +class BaseTestCases(object): + class ProbabilityEstimationBase(unittest.TestCase): + texts = [['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees']] + dictionary = None -class TestProbabilityEstimation(ProbabilityEstimationBase): - def setUp(self): - self.dictionary = HashDictionary(self.texts) - # Following is the mapping: - # {'computer': 10608, - # 'eps': 31049, - # 'graph': 18451, - # 'human': 31002, - # 'interface': 12466, - # 'response': 5232, - # 'system': 5798, - # 'time': 29104, - # 'trees': 23844, - # 'user': 12736} - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] - # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [ - [ - (5798, 18451), - (10608, 18451), - (10608, 5798) - ], [ - (10608, 18451), - (12736, 18451), - (12736, 10608) + def build_segmented_topics(self): + # Suppose the segmented topics from s_one_pre are: + token2id = self.dictionary.token2id + computer_id = token2id['computer'] + system_id = token2id['system'] + user_id = token2id['user'] + graph_id = token2id['graph'] + 
self.segmented_topics = [ + [ + (system_id, graph_id), + (computer_id, graph_id), + (computer_id, system_id) + ], [ + (computer_id, graph_id), + (user_id, graph_id), + (user_id, computer_id) + ] ] - ] - - def testPBooleanDocument(self): - """Test p_boolean_document()""" - # Unique topic ids are 5798, 10608, 12736 and 18451 - accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - obtained = accumulator.index_to_dict() - expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}} - self.assertEqual(expected, obtained) - - def testPBooleanSlidingWindow(self): - """Test p_boolean_sliding_window()""" - # Test with window size as 2. window_id is zero indexed. - accumulator = probability_estimation.p_boolean_sliding_window( - self.texts, self.segmented_topics, self.dictionary, 2) - self.assertEqual(1, accumulator[10608]) - self.assertEqual(3, accumulator[12736]) - self.assertEqual(1, accumulator[18451]) - self.assertEqual(4, accumulator[5798]) - - -class TestProbabilityEstimationWithNormalDictionary(ProbabilityEstimationBase): - def setUp(self): + + self.computer_id = computer_id + self.system_id = system_id + self.user_id = user_id + self.graph_id = graph_id + + def setup_dictionary(self): + raise NotImplementedError + + def setUp(self): + self.setup_dictionary() + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + self.build_segmented_topics() + + def testPBooleanDocument(self): + """Test p_boolean_document()""" + accumulator = probability_estimation.p_boolean_document( + self.corpus, self.segmented_topics) + obtained = accumulator.index_to_dict() + expected = { + self.graph_id: {5}, + self.user_id: {1, 3}, + self.system_id: {1, 2}, + self.computer_id: {0} + } + self.assertEqual(expected, obtained) + + def testPBooleanSlidingWindow(self): + """Test p_boolean_sliding_window()""" + # Test with window size as 2. window_id is zero indexed. + accumulator = probability_estimation.p_boolean_sliding_window( + self.texts, self.segmented_topics, self.dictionary, 2) + self.assertEqual(1, accumulator[self.computer_id]) + self.assertEqual(3, accumulator[self.user_id]) + self.assertEqual(1, accumulator[self.graph_id]) + self.assertEqual(4, accumulator[self.system_id]) + + +class TestProbabilityEstimation(BaseTestCases.ProbabilityEstimationBase): + def setup_dictionary(self): + self.dictionary = HashDictionary(self.texts) + + +class TestProbabilityEstimationWithNormalDictionary(BaseTestCases.ProbabilityEstimationBase): + def setup_dictionary(self): self.dictionary = Dictionary(self.texts) self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()} - # Following is the mapping: - # {u'computer': 1, - # u'eps': 5, - # u'graph': 9, - # u'human': 2, - # u'interface': 0, - # u'response': 6, - # u'system': 4, - # u'time': 7, - # u'trees': 8, - # u'user': 3} - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] - # Suppose the segmented topics from s_one_pre are: - self.segmented_topics = [ - [ - (4, 9), - (1, 9), - (1, 4) - ], [ - (1, 9), - (3, 9), - (3, 1) - ] - ] - - def testPBooleanDocument(self): - """Test p_boolean_document()""" - accumulator = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) - obtained = accumulator.index_to_dict() - expected = {9: {5}, 3: {1, 3}, 4: {1, 2}, 1: {0}} - self.assertEqual(expected, obtained) - - def testPBooleanSlidingWindow(self): - """Test p_boolean_sliding_window()""" - # Test with window size as 2. window_id is zero indexed. 
- accumulator = probability_estimation.p_boolean_sliding_window( - self.texts, self.segmented_topics, self.dictionary, 2) - self.assertEqual(1, accumulator[1]) - self.assertEqual(3, accumulator[3]) - self.assertEqual(1, accumulator[9]) - self.assertEqual(4, accumulator[4]) if __name__ == '__main__': diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 7e8e57d703..8cdf1027fd 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -17,7 +17,7 @@ import numpy as np import scipy.sparse as sps -from six import viewitems +from six import viewitems, string_types from gensim import utils @@ -72,10 +72,10 @@ def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") def __getitem__(self, word_or_words): - if hasattr(word_or_words, '__iter__'): - return self.get_co_occurrences(*word_or_words) - else: + if isinstance(word_or_words, string_types) or not hasattr(word_or_words, '__iter__'): return self.get_occurrences(word_or_words) + else: + return self.get_co_occurrences(*word_or_words) def get_occurrences(self, word_id): """Return number of docs the word occurs in, once `accumulate` has been called.""" From 343da69f0c49e71131f71910ef5d70250d73285c Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 2 Jun 2017 08:47:04 -0400 Subject: [PATCH 26/33] #1342: Fixed a few bugs and added test coverage for the coherencemodel accumulator caching; made model a property with a setter that also sets the topics and uncaches the accumulator if the model's topics have ids not tracked by the accumulator. --- gensim/models/coherencemodel.py | 74 +++++++------ gensim/test/test_coherencemodel.py | 166 +++++++++++++++-------------- 2 files changed, 131 insertions(+), 109 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index adcac0f27a..e53d5600ca 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -38,28 +38,28 @@ boolean_document_based = {'u_mass'} sliding_window_based = {'c_v', 'c_uci', 'c_npmi'} -make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') - -coherence_dict = { - 'u_mass': make_pipeline(segmentation.s_one_pre, - probability_estimation.p_boolean_document, - direct_confirmation_measure.log_conditional_probability, - aggregation.arithmetic_mean), - 'c_v': make_pipeline(segmentation.s_one_set, - probability_estimation.p_boolean_sliding_window, - indirect_confirmation_measure.cosine_similarity, - aggregation.arithmetic_mean), - 'c_uci': make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), - 'c_npmi': make_pipeline(segmentation.s_one_one, +_make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') + +COHERENCE_MEASURES = { + 'u_mass': _make_pipeline(segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean), + 'c_v': _make_pipeline(segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean), + 'c_uci': _make_pipeline(segmentation.s_one_one, probability_estimation.p_boolean_sliding_window, direct_confirmation_measure.log_ratio_measure, aggregation.arithmetic_mean), + 'c_npmi': _make_pipeline(segmentation.s_one_one, + 
probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), } -sliding_windows_dict = { +SLIDING_WINDOW_SIZES = { 'c_v': 110, 'c_uci': 10, 'c_npmi': 10 @@ -174,7 +174,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= elif coherence in sliding_window_based: self.window_size = window_size if self.window_size is None: - self.window_size = sliding_windows_dict[self.coherence] + self.window_size = SLIDING_WINDOW_SIZES[self.coherence] if texts is None: raise ValueError("'texts' should be provided for %s coherence." % coherence) else: @@ -183,8 +183,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= raise ValueError("%s coherence is not currently supported." % coherence) self.topn = topn - self.model = model - + self._model = model self._accumulator = None self._topics = None self.topics = topics @@ -194,9 +193,21 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= def __str__(self): return str(self.measure) + @property + def model(self): + return self._model + + @model.setter + def model(self, model): + self._model = model + if model is not None: + new_topics = self._get_topics() + self._update_accumulator(new_topics) + self._topics = new_topics + @property def measure(self): - return coherence_dict[self.coherence] + return COHERENCE_MEASURES[self.coherence] @property def topics(self): @@ -208,33 +219,34 @@ def topics(self, topics): if self.model is not None: new_topics = self._get_topics() if topics is not None: - logger.warn("Ignoring topics you are attempting to set in favor of model's topics: %s" % self.model) + logger.warning( + "Ignoring topics you are attempting to set in favor of model's topics: %s", + self.model) elif topics is not None: new_topics = [] for topic in topics: topic_token_ids = np.array([self.dictionary.token2id[token] for token in topic]) new_topics.append(topic_token_ids) + self._update_accumulator(new_topics) + self._topics = new_topics + + def _update_accumulator(self, new_topics): if self._relevant_ids_will_differ(new_topics): logger.debug("Wiping cached accumulator since it does not contain all relevant ids.") self._accumulator = None - self._topics = new_topics - def _relevant_ids_will_differ(self, new_topics): - if not self._topics_differ(new_topics): + if self._accumulator is None or not self._topics_differ(new_topics): return False - measure = self.measure - current_set = unique_ids_from_segments(measure.seg(self.topics)) - new_set = unique_ids_from_segments(measure.seg(new_topics)) - return not current_set.issuperset(new_set) + new_set = unique_ids_from_segments(self.measure.seg(new_topics)) + return not self._accumulator.relevant_ids.issuperset(new_set) def _topics_differ(self, new_topics): return (new_topics is not None and self._topics is not None and - self._accumulator is not None and - not np.equal(new_topics, self._topics).all()) + not np.array_equal(new_topics, self._topics)) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 679f115f5b..4827b6ba1e 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -8,33 +8,19 @@ Automated tests for checking transformation algorithms (the models package). 
""" +import os import logging import unittest -import os -import os.path import tempfile +import numpy as np + from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit from gensim.corpora.dictionary import Dictionary - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] +from gensim.matutils import argsort def testfile(): @@ -43,6 +29,23 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): + + # set up vars used in testing ("Deerwester" from the web tutorial) + texts = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] + dictionary = Dictionary(texts) + + @classmethod + def setUpClass(cls): + cls.corpus = [cls.dictionary.doc2bow(text) for text in cls.texts] + def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. 
# `topics1` is clearly better as it has a clear distinction between system-human @@ -52,28 +55,31 @@ def setUp(self): ['graph', 'minors', 'trees', 'eps']] self.topics2 = [['user', 'graph', 'minors', 'system'], ['time', 'graph', 'survey', 'minors']] - self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=0, iterations=0) + self.ldamodel = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=2, + passes=0, iterations=0) mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None if self.mallet_path: - self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus, id2word=dictionary, num_topics=2, iterations=0) + self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, iterations=0) vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) if not vw_path: - msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" - logging.info(msg) + logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified," + " skipping sanity checks for LDA Model") self.vw_path = None else: self.vw_path = vw_path - self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) + self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, passes=0) def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: - kwargs = dict(corpus=corpus, dictionary=dictionary, coherence=coherence) + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence) cm1 = CoherenceModel(topics=self.topics1, **kwargs) cm2 = CoherenceModel(topics=self.topics2, **kwargs) else: - kwargs = dict(texts=texts, dictionary=dictionary, coherence=coherence) + kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence) cm1 = CoherenceModel(topics=self.topics1, **kwargs) cm2 = CoherenceModel(topics=self.topics2, **kwargs) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) @@ -99,127 +105,131 @@ def testUMassLdaModel(self): # Note that this is just a sanity check because LDA does not guarantee a better coherence # value on the topics if iterations are increased. 
This can be seen here: # https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde - try: - cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.ldamodel, corpus=self.corpus, coherence='u_mass') def testCvLdaModel(self): """Perform sanity check to see if c_v coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_v') def testCuciLdaModel(self): """Perform sanity check to see if c_uci coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_uci') def testCnpmiLdaModel(self): """Perform sanity check to see if c_npmi coherence works with LDA Model""" - try: - cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_npmi') def testUMassMalletModel(self): """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.malletmodel, corpus=self.corpus, coherence='u_mass') def testCvMalletModel(self): """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_v') def testCuciMalletModel(self): """Perform sanity check to see if c_uci coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_uci') def testCnpmiMalletModel(self): """Perform sanity check to see if c_npmi coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: return - try: - cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_npmi') def testUMassVWModel(self): """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, corpus=corpus, coherence='u_mass') - except: - raise + CoherenceModel(model=self.vwmodel, corpus=self.corpus, coherence='u_mass') def testCvVWModel(self): """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_v') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_v') def testCuciVWModel(self): """Perform sanity check to see if c_uci coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_uci') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_uci') def testCnpmiVWModel(self): """Perform sanity check to see if c_npmi coherence works with LDA VW gensim wrapper""" if not self.vw_path: return - try: - cm = 
CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_npmi') - except: - raise + CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_npmi') def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, coherence='u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + coherence='u_mass') # not providing texts for c_v and instead providing corpus - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='c_v') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + dictionary=self.dictionary, coherence='c_v') # not providing corpus or texts for u_mass - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, + dictionary=self.dictionary, coherence='u_mass') def testPersistence(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceCompressed(self): fname = testfile() + '.gz' - model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testAccumulatorCachingSameSizeTopics(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + accumulator = cm1._accumulator + self.assertIsNotNone(accumulator) + cm1.topics = self.topics1 + self.assertEqual(accumulator, cm1._accumulator) + cm1.topics = self.topics2 + self.assertEqual(None, cm1._accumulator) + + def testAccumulatorCachingTopicSubsets(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + accumulator = cm1._accumulator + self.assertIsNotNone(accumulator) + cm1.topics = [t[:2] for t in self.topics1] + self.assertEqual(accumulator, cm1._accumulator) + cm1.topics = self.topics1 + self.assertEqual(accumulator, cm1._accumulator) + + def testAccumulatorCachingWithModelSetting(self): + kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm1.estimate_probabilities() + self.assertIsNotNone(cm1._accumulator) + cm1.model = self.ldamodel + topics = [] + for topic in self.ldamodel.state.get_lambda(): + bestn = argsort(topic, topn=cm1.topn, reverse=True) + topics.append(bestn) + self.assertTrue(np.array_equal(topics, cm1.topics)) + self.assertIsNone(cm1._accumulator) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 1ce8a720629e8c920ad27e992f9edc59efa24aae Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 2 Jun 2017 09:24:11 -0400 Subject: [PATCH 27/33] #1342: 
Further tests for persistence of accumulator. --- gensim/test/test_coherencemodel.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 4827b6ba1e..426a6ef71c 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -195,6 +195,26 @@ def testPersistenceCompressed(self): model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testPersistenceAfterProbabilityEstimationUsingCorpus(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, + coherence='u_mass') + model.estimate_probabilities() + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertIsNotNone(model2._accumulator) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + + def testPersistenceAfterProbabilityEstimationUsingTexts(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, texts=self.texts, dictionary=self.dictionary, + coherence='c_v') + model.estimate_probabilities() + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertIsNotNone(model2._accumulator) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + def testAccumulatorCachingSameSizeTopics(self): kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') cm1 = CoherenceModel(topics=self.topics1, **kwargs) From 96fd3433ec124b0be0462e14309dfd27c4b580f1 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Sun, 4 Jun 2017 15:52:08 -0400 Subject: [PATCH 28/33] #1342: Add test case for `CorpusAccumulator`. --- gensim/test/test_text_analysis.py | 30 ++++++++++++++++++++++++- gensim/topic_coherence/text_analysis.py | 5 ++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index ed6d482b44..c32e6b2ebd 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -2,7 +2,8 @@ import unittest from gensim.topic_coherence.text_analysis import \ - InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator + InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \ + CorpusAccumulator from gensim.corpora.dictionary import Dictionary @@ -145,6 +146,33 @@ def init_accumulator2(self): return self.accumulator_cls(2, self.top_ids2, self.dictionary2) +class TestCorpusAnalyzer(unittest.TestCase): + + def setUp(self): + self.dictionary = BaseTestCases.TextAnalyzerTestBase.dictionary + self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids + self.corpus = [self.dictionary.doc2bow(doc) + for doc in BaseTestCases.TextAnalyzerTestBase.texts] + + def test_index_accumulation(self): + accumulator = CorpusAccumulator(self.top_ids)\ + .accumulate(self.corpus) + inverted_index = accumulator.index_to_dict() + expected = { + 10: {0, 2, 3}, + 15: {0}, + 20: {0}, + 21: {1, 2, 3}, + 17: {1, 2} + } + self.assertDictEqual(expected, inverted_index) + + self.assertEqual(3, accumulator.get_occurrences(10)) + self.assertEqual(2, accumulator.get_occurrences(17)) + self.assertEqual(2, accumulator.get_co_occurrences(10, 21)) + self.assertEqual(1, accumulator.get_co_occurrences(10, 17)) + + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 8cdf1027fd..90d7d83467 
100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -152,9 +152,8 @@ class CorpusAccumulator(InvertedIndexBased): def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for word_id in top_ids_in_doc: - self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) + for word_id in top_ids_in_doc: + self._inverted_index[self.id2contiguous[word_id]].add(self._num_docs) def accumulate(self, corpus): for document in corpus: From a631ab69df0b3e553d6b88f8c5ecfa1e92bcb52f Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 5 Jun 2017 11:08:56 -0400 Subject: [PATCH 29/33] #1342: Formatting fixes for hanging indents and overly long lines. --- gensim/corpora/wikicorpus.py | 23 ++++--- gensim/models/coherencemodel.py | 67 +++++++++++-------- gensim/test/test_text_analysis.py | 42 ++++++------ .../direct_confirmation_measure.py | 7 +- .../indirect_confirmation_measure.py | 31 +++++---- .../topic_coherence/probability_estimation.py | 4 +- gensim/topic_coherence/text_analysis.py | 42 +++++++----- 7 files changed, 121 insertions(+), 95 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 13b111db4f..ec032067f1 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -20,13 +20,13 @@ import bz2 import logging -import re -from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution import multiprocessing +import re import signal +from xml.etree.cElementTree import \ + iterparse # LXML isn't faster, so let's go with the built-in solution from gensim import utils - # cannot import whole gensim.corpora, because that imports wikicorpus... from gensim.corpora.dictionary import Dictionary from gensim.corpora.textcorpus import TextCorpus @@ -266,7 +266,8 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ - def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, + filter_namespaces=('0',)): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. 
@@ -305,9 +306,10 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = ((text, self.lemmatize, title, pageid) - for title, text, pageid - in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + texts = \ + ((text, self.lemmatize, title, pageid) + for title, text, pageid + in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) try: @@ -327,9 +329,10 @@ def get_texts(self): else: yield tokens except KeyboardInterrupt: - logger.warn("user terminated iteration over Wikipedia corpus after %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + logger.warn( + "user terminated iteration over Wikipedia corpus after %i documents with %i positions" + " (total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) else: logger.info( "finished iterating over Wikipedia corpus of %i documents with %i positions" diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index e53d5600ca..d35a266a4a 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -19,8 +19,8 @@ """ import logging -from collections import namedtuple import multiprocessing as mp +from collections import namedtuple import numpy as np @@ -41,22 +41,30 @@ _make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') COHERENCE_MEASURES = { - 'u_mass': _make_pipeline(segmentation.s_one_pre, - probability_estimation.p_boolean_document, - direct_confirmation_measure.log_conditional_probability, - aggregation.arithmetic_mean), - 'c_v': _make_pipeline(segmentation.s_one_set, - probability_estimation.p_boolean_sliding_window, - indirect_confirmation_measure.cosine_similarity, - aggregation.arithmetic_mean), - 'c_uci': _make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), - 'c_npmi': _make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), + 'u_mass': _make_pipeline( + segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean + ), + 'c_v': _make_pipeline( + segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean + ), + 'c_uci': _make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean + ), + 'c_npmi': _make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean + ), } SLIDING_WINDOW_SIZES = { @@ -102,8 +110,8 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
""" - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, - coherence='c_v', topn=10, processes=-1): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, + window_size=None, coherence='c_v', topn=10, processes=-1): """ Args: ---- @@ -152,8 +160,9 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= # Check if associated dictionary is provided. if dictionary is None: if isinstance(model.id2word, FakeDict): - raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model" - " should be set as the associated dictionary.") + raise ValueError( + "The associated dictionary should be provided with the corpus or 'id2word'" + " for topic model should be set as the associated dictionary.") else: self.dictionary = model.id2word else: @@ -168,7 +177,9 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.texts = texts self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] else: - raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + raise ValueError( + "Either 'corpus' with 'dictionary' or 'texts' should " + "be provided for %s coherence.", coherence) # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: @@ -176,11 +187,11 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= if self.window_size is None: self.window_size = SLIDING_WINDOW_SIZES[self.coherence] if texts is None: - raise ValueError("'texts' should be provided for %s coherence." % coherence) + raise ValueError("'texts' should be provided for %s coherence.", coherence) else: self.texts = texts else: - raise ValueError("%s coherence is not currently supported." % coherence) + raise ValueError("%s coherence is not currently supported.", coherence) self.topn = topn self._model = model @@ -245,8 +256,8 @@ def _relevant_ids_will_differ(self, new_topics): def _topics_differ(self, new_topics): return (new_topics is not None and - self._topics is not None and - not np.array_equal(new_topics, self._topics)) + self._topics is not None and + not np.array_equal(new_topics, self._topics)) def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" @@ -264,8 +275,8 @@ def _get_topics(self): bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) else: - raise ValueError("This topic model is not currently supported. Supported topic models are" - "LdaModel, LdaVowpalWabbit and LdaMallet.") + raise ValueError("This topic model is not currently supported. 
Supported topic models " + " are LdaModel, LdaVowpalWabbit and LdaMallet.") return topics def segment_topics(self): diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index c32e6b2ebd..e6f4aba86b 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -1,10 +1,10 @@ import logging import unittest +from gensim.corpora.dictionary import Dictionary from gensim.topic_coherence.text_analysis import \ InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, \ CorpusAccumulator -from gensim.corpora.dictionary import Dictionary class BaseTestCases(object): @@ -28,16 +28,18 @@ class TextAnalyzerTestBase(unittest.TestCase): dictionary.id2token = {v: k for k, v in token2id.items()} top_ids = set(token2id.values()) - texts2 = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['user', 'user']] + texts2 = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['user', 'user'] + ] dictionary2 = Dictionary(texts2) dictionary2.id2token = {v: k for k, v in dictionary2.token2id.items()} top_ids2 = set(dictionary2.token2id.values()) @@ -51,8 +53,7 @@ def init_accumulator2(self): return self.accumulator_cls(self.top_ids2, self.dictionary2) def test_occurrence_counting(self): - accumulator = self.init_accumulator()\ - .accumulate(self.texts, 3) + accumulator = self.init_accumulator().accumulate(self.texts, 3) self.assertEqual(3, accumulator.get_occurrences("this")) self.assertEqual(1, accumulator.get_occurrences("is")) self.assertEqual(1, accumulator.get_occurrences("a")) @@ -62,8 +63,7 @@ def test_occurrence_counting(self): self.assertEqual(1, accumulator.get_co_occurrences("is", "a")) def test_occurrence_counting2(self): - accumulator = self.init_accumulator2()\ - .accumulate(self.texts2, 110) + accumulator = self.init_accumulator2().accumulate(self.texts2, 110) self.assertEqual(2, accumulator.get_occurrences("human")) self.assertEqual(4, accumulator.get_occurrences("user")) self.assertEqual(3, accumulator.get_occurrences("graph")) @@ -90,8 +90,7 @@ def test_occurrence_counting2(self): self.assertEqual(expected_count, accumulator.get_co_occurrences(word_id2, word_id1)) def test_occurences_for_irrelevant_words(self): - accumulator = self.init_accumulator() \ - .accumulate(self.texts, 2) + accumulator = self.init_accumulator().accumulate(self.texts, 2) with self.assertRaises(KeyError): accumulator.get_occurrences("irrelevant") with self.assertRaises(KeyError): @@ -117,7 +116,7 @@ def test_accumulate1(self): self.assertDictEqual(expected, inverted_index) def test_accumulate2(self): - accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary) \ + accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)\ .accumulate(self.texts, 3) # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document'], # ['test', 'test', 'this'] @@ -151,12 +150,11 @@ class TestCorpusAnalyzer(unittest.TestCase): def setUp(self): self.dictionary = 
BaseTestCases.TextAnalyzerTestBase.dictionary self.top_ids = BaseTestCases.TextAnalyzerTestBase.top_ids - self.corpus = [self.dictionary.doc2bow(doc) - for doc in BaseTestCases.TextAnalyzerTestBase.texts] + self.corpus = \ + [self.dictionary.doc2bow(doc) for doc in BaseTestCases.TextAnalyzerTestBase.texts] def test_index_accumulation(self): - accumulator = CorpusAccumulator(self.top_ids)\ - .accumulate(self.corpus) + accumulator = CorpusAccumulator(self.top_ids).accumulate(self.corpus) inverted_index = accumulator.index_to_dict() expected = { 10: {0, 2, 3}, diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 60631375ef..29f68ad56e 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -9,6 +9,7 @@ """ import logging + import numpy as np logger = logging.getLogger(__name__) @@ -24,7 +25,8 @@ def log_conditional_probability(segmented_topics, accumulator): Args: ---- - segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. Returns: @@ -62,7 +64,8 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): Args: ---- - segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. Returns: diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index eccfb0a3b5..8321656067 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -5,12 +5,13 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of indirect -confirmation measure is that it computes similarity of words in W' and W* with respect to direct confirmations to all words. -Eg. Suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are -seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” -or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation -measures may capture semantic support that direct measures would miss. +This module contains functions to compute confirmation on a pair of words or word subsets. +The advantage of indirect confirmation measure is that it computes similarity of words in W' and +W* with respect to direct confirmations to all words. Eg. Suppose x and z are both competing +brands of cars, which semantically support each other. However, both brands are seldom mentioned +together in documents in the reference corpus. But their confirmations to other words like “road” +or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. +Thus, indirect confirmation measures may capture semantic support that direct measures would miss. 
The formula used to compute indirect confirmation measure is: @@ -23,11 +24,11 @@ Here 'm' is the direct confirmation measure used. """ -import logging import itertools +import logging -import scipy.sparse as sps import numpy as np +import scipy.sparse as sps from gensim.topic_coherence import direct_confirmation_measure @@ -48,10 +49,13 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm Args: ---- - segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. + accumulator : Output from the probability_estimation module. + Is an accumulator of word occurrences (see text_analysis module). topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + measure : String. Direct confirmation measure to be used. + Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors; default is 1. Returns: @@ -78,13 +82,14 @@ def __init__(self, measure, topics, accumulator, gamma): if measure == 'nlr': self.similarity = _pair_npmi else: - raise ValueError("The direct confirmation measure you entered is not currently supported.") + raise ValueError( + "The direct confirmation measure you entered is not currently supported.") self.mapping = _map_to_contiguous(topics) self.vocab_size = len(self.mapping) self.accumulator = accumulator self.gamma = gamma - self.sim_cache = {} # Cache similarities between tokens represented as pairs of word ids, e.g. (1, 2) + self.sim_cache = {} # Cache similarities between tokens (pairs of word ids), e.g. (1, 2) self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector def __getitem__(self, idx): diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index fb583b99fc..0c62d68985 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -8,8 +8,8 @@ This module contains functions to perform segmentation on a list of topics. """ -import logging import itertools +import logging from gensim.topic_coherence.text_analysis import \ CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator @@ -60,7 +60,7 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p accumulator = WordOccurrenceAccumulator(top_ids, dictionary) else: accumulator = ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary) - logger.info("using %s to estimate probabilities from sliding windows" % accumulator) + logger.info("using %s to estimate probabilities from sliding windows", accumulator) return accumulator.accumulate(texts, window_size) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 90d7d83467..1b21334178 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -9,10 +9,10 @@ statistical information about word occurrences. 
""" -import sys -import logging import itertools +import logging import multiprocessing as mp +import sys from collections import Counter import numpy as np @@ -65,8 +65,9 @@ def num_docs(self): def num_docs(self, num): self._num_docs = num if self._num_docs % self.log_every == 0: - logger.info("%s accumulated stats from %d documents", - self.__class__.__name__, self._num_docs) + logger.info( + "%s accumulated stats from %d documents", + self.__class__.__name__, self._num_docs) def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") @@ -143,7 +144,7 @@ def _get_co_occurrences(self, word_id1, word_id2): def index_to_dict(self): contiguous2id = {n: word_id for word_id, n in viewitems(self.id2contiguous)} - return {contiguous2id[n]: doc_id_list for n, doc_id_list in enumerate(self._inverted_index)} + return {contiguous2id[n]: doc_id_set for n, doc_id_set in enumerate(self._inverted_index)} class CorpusAccumulator(InvertedIndexBased): @@ -177,8 +178,9 @@ def __init__(self, relevant_ids, dictionary): def accumulate(self, texts, window_size): relevant_texts = self._iter_texts(texts) - windows = utils.iter_windows(relevant_texts, window_size, ignore_below_size=False, - include_doc_num=True) + windows = utils.iter_windows( + relevant_texts, window_size, ignore_below_size=False, include_doc_num=True) + for doc_num, virtual_document in windows: self.analyze_text(virtual_document, doc_num) self.num_docs += 1 @@ -307,7 +309,8 @@ def __init__(self, processes, *args, **kwargs): """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: - raise ValueError("Must have at least 2 processes to run in parallel; got %d", processes) + raise ValueError( + "Must have at least 2 processes to run in parallel; got %d" % processes) self.processes = processes self.batch_size = kwargs.get('batch_size', 64) @@ -321,8 +324,7 @@ def accumulate(self, texts, window_size): self.queue_all_texts(input_q, texts, window_size) interrupted = False except KeyboardInterrupt: - logger.warn("stats accumulation interrupted; <= %d documents processed", - self._num_docs) + logger.warn("stats accumulation interrupted; <= %d documents processed", self._num_docs) interrupted = True accumulators = self.terminate_workers(input_q, output_q, workers, interrupted) @@ -414,8 +416,9 @@ def merge_accumulators(self, accumulators): # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. 
accumulator._symmetrize() - logger.info("accumulated word occurrence stats for %d virtual documents", - accumulator.num_docs) + logger.info( + "accumulated word occurrence stats for %d virtual documents", + accumulator.num_docs) return accumulator @@ -434,8 +437,9 @@ def run(self): try: self._run() except KeyboardInterrupt: - logger.info("%s interrupted after processing %d documents", - self.__class__.__name__, self.accumulator.num_docs) + logger.info( + "%s interrupted after processing %d documents", + self.__class__.__name__, self.accumulator.num_docs) except: logger.exception("worker encountered unexpected exception") finally: @@ -453,11 +457,13 @@ def _run(self): self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) - logger.debug("completed batch %d; %d documents processed (%d virtual)", - batch_num, n_docs, self.accumulator.num_docs) + logger.debug( + "completed batch %d; %d documents processed (%d virtual)", + batch_num, n_docs, self.accumulator.num_docs) - logger.debug("finished all batches; %d documents processed (%d virtual)", - n_docs, self.accumulator.num_docs) + logger.debug( + "finished all batches; %d documents processed (%d virtual)", + n_docs, self.accumulator.num_docs) def reply_to_master(self): logger.info("serializing accumulator to return to master...") From 5f58bdae633a003a1157655b275672dcde1a61f0 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Tue, 6 Jun 2017 14:04:21 -0400 Subject: [PATCH 30/33] #1342: Fix `indirect_confirmation_measure.cosine_similarity` to return individual topic coherence values, then average those. Make the `ParallelWordOccurrenceAccumulator` return a `WordOccurrenceAccumulator` after accumulation, so it can be trained further afterwards if desired. --- gensim/test/test_coherencemodel.py | 15 +++++++-------- gensim/test/test_indirect_confirmation.py | 11 +++++------ .../indirect_confirmation_measure.py | 6 ++++-- gensim/topic_coherence/text_analysis.py | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 426a6ef71c..d055523dff 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -8,19 +8,19 @@ Automated tests for checking transformation algorithms (the models package). 
""" -import os import logging -import unittest +import os import tempfile +import unittest import numpy as np +from gensim.corpora.dictionary import Dictionary +from gensim.matutils import argsort from gensim.models.coherencemodel import CoherenceModel, boolean_document_based from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit -from gensim.corpora.dictionary import Dictionary -from gensim.matutils import argsort def testfile(): @@ -76,12 +76,11 @@ def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence) - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm2 = CoherenceModel(topics=self.topics2, **kwargs) else: kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence) - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm2 = CoherenceModel(topics=self.topics2, **kwargs) + + cm1 = CoherenceModel(topics=self.topics1, **kwargs) + cm2 = CoherenceModel(topics=self.topics2, **kwargs) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) def testUMass(self): diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index aedd9eaa9a..e78d32ac58 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -11,11 +11,11 @@ import logging import unittest +import numpy as np + +from gensim.corpora.dictionary import Dictionary from gensim.topic_coherence import indirect_confirmation_measure from gensim.topic_coherence import text_analysis -from gensim.corpora.dictionary import Dictionary - -import numpy as np class TestIndirectConfirmation(unittest.TestCase): @@ -46,9 +46,8 @@ def testCosineSimilarity(self): # 4. Calculate nlr(1, 1) + nlr(2, 1). Calculate nlr(1, 2), nlr(2, 2). This is our second vector. # 5. Find out cosine similarity between these two vectors. # 6. Similarly for the second segmentation. - expected = [0.6230, 0.6230] # To account for EPSILON approximation - for i in range(len(expected)): - self.assertAlmostEqual(obtained[i], expected[i], 4) + expected = (0.6230 + 0.6230) / 2. 
# To account for EPSILON approximation + self.assertAlmostEqual(expected, obtained[0], 4) if __name__ == '__main__': diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8321656067..a05676ab61 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -67,10 +67,12 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm s_cos_sim = [] for topic_words, topic_segments in zip(topics, segmented_topics): topic_words = tuple(topic_words) # because tuples are hashable - for w_prime, w_star in topic_segments: + segment_sims = np.zeros(len(topic_segments)) + for i, (w_prime, w_star) in enumerate(topic_segments): w_prime_cv = context_vectors[w_prime, topic_words] w_star_cv = context_vectors[w_star, topic_words] - s_cos_sim.append(_cossim(w_prime_cv, w_star_cv)) + segment_sims[i] = _cossim(w_prime_cv, w_star_cv) + s_cos_sim.append(np.mean(segment_sims)) return s_cos_sim diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 1b21334178..a44e57fb3e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -410,8 +410,8 @@ def merge_accumulators(self, accumulators): occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed by all the individual accumulators. """ - accumulator = accumulators[0] - for other_accumulator in accumulators[1:]: + accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) + for other_accumulator in accumulators: accumulator.merge(other_accumulator) # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. From b941f3c25374a8ad9e200567f868e6fe1f06ce4d Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Wed, 7 Jun 2017 10:15:47 -0400 Subject: [PATCH 31/33] #1342: Fix `direct_confirmation_measure` functions to return individual topic coherence values, then average those. 
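
Illustrative sketch (not from the patch; the scores below are hypothetical): each topic's segment-level measures are now averaged into one value per topic, and the aggregation stage then takes the arithmetic mean over those per-topic values.

    import numpy as np

    # Hypothetical segment-level confirmation scores for two topics.
    segment_scores = [[-0.69, -0.41, -0.22], [-1.10, -0.35]]
    per_topic = [np.mean(scores) for scores in segment_scores]  # [-0.44, -0.725]
    overall = np.mean(per_topic)  # what aggregation.arithmetic_mean then returns
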
--- gensim/test/test_direct_confirmation.py | 9 ++++++--- .../topic_coherence/direct_confirmation_measure.py | 13 +++++++++---- .../indirect_confirmation_measure.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index ad39b99b62..1d4f701cc9 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -33,21 +33,24 @@ def setUp(self): def testLogConditionalProbability(self): """Test log_conditional_probability()""" - obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.accumulator)[0] + obtained = direct_confirmation_measure.log_conditional_probability( + self.segmentation, self.accumulator)[0] # Answer should be ~ ln(1 / 2) = -0.693147181 expected = -0.693147181 self.assertAlmostEqual(obtained, expected) def testLogRatioMeasure(self): """Test log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator)[0] + obtained = direct_confirmation_measure.log_ratio_measure( + self.segmentation, self.accumulator)[0] # Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 expected = -0.182321557 self.assertAlmostEqual(obtained, expected) def testNormalizedLogRatioMeasure(self): """Test normalized_log_ratio_measure()""" - obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.accumulator, normalize=True)[0] + obtained = direct_confirmation_measure.log_ratio_measure( + self.segmentation, self.accumulator, normalize=True)[0] # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753 expected = -0.113282753 self.assertAlmostEqual(obtained, expected) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 29f68ad56e..467d134f29 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -31,11 +31,12 @@ def log_conditional_probability(segmented_topics, accumulator): Returns: ------- - m_lc : List of log conditional probability measure on each set in segmented topics. + m_lc : List of log conditional probability measure for each topic. """ m_lc = [] num_docs = float(accumulator.num_docs) for s_i in segmented_topics: + segment_sims = [] for w_prime, w_star in s_i: try: w_star_count = accumulator[w_star] @@ -44,7 +45,8 @@ def log_conditional_probability(segmented_topics, accumulator): except KeyError: m_lc_i = 0.0 - m_lc.append(m_lc_i) + segment_sims.append(m_lc_i) + m_lc.append(np.mean(segment_sims)) return m_lc @@ -70,11 +72,12 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): Returns: ------- - m_lr : List of log ratio measures on each set in segmented topics. + m_lr : List of log ratio measures for each topic. 
""" m_lr = [] num_docs = float(accumulator.num_docs) for s_i in segmented_topics: + segment_sims = [] for w_prime, w_star in s_i: w_prime_count = accumulator[w_prime] w_star_count = accumulator[w_star] @@ -90,6 +93,8 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): numerator = (co_occur_count / num_docs) + EPSILON denominator = (w_prime_count / num_docs) * (w_star_count / num_docs) m_lr_i = np.log(numerator / denominator) - m_lr.append(m_lr_i) + + segment_sims.append(m_lr_i) + m_lr.append(np.mean(segment_sims)) return m_lr diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index a05676ab61..07f221e941 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -60,7 +60,7 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm Returns: ------- - s_cos_sim : array of cosine similarity of the context vectors for each segmentation + s_cos_sim : list of indirect cosine similarity measure for each topic. """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) From 75fcac8c90ce17d5b315f1e422becc8cf8b64764 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 8 Jun 2017 13:15:28 -0400 Subject: [PATCH 32/33] #1342: Hanging indents and switch out `union` with `update` for unique ids from topic segments. --- gensim/test/test_coherencemodel.py | 73 +++++++++++-------- gensim/test/test_probability_estimation.py | 18 +++-- .../topic_coherence/probability_estimation.py | 2 +- 3 files changed, 53 insertions(+), 40 deletions(-) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index d055523dff..039db55a48 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -31,15 +31,17 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): # set up vars used in testing ("Deerwester" from the web tutorial) - texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] + ] dictionary = Dictionary(texts) @classmethod @@ -55,22 +57,28 @@ def setUp(self): ['graph', 'minors', 'trees', 'eps']] self.topics2 = [['user', 'graph', 'minors', 'system'], ['time', 'graph', 'survey', 'minors']] - self.ldamodel = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=2, - passes=0, iterations=0) + self.ldamodel = LdaModel( + corpus=self.corpus, id2word=self.dictionary, num_topics=2, + passes=0, iterations=0) + mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None if self.mallet_path: - self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, iterations=0) + self.malletmodel = LdaMallet( + mallet_path=self.mallet_path, corpus=self.corpus, + id2word=self.dictionary, num_topics=2, iterations=0) + 
vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) if not vw_path: - logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified," - " skipping sanity checks for LDA Model") + logging.info( + "Environment variable 'VOWPAL_WABBIT_PATH' not specified," + " skipping sanity checks for LDA Model") self.vw_path = None else: self.vw_path = vw_path - self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, passes=0) + self.vwmodel = LdaVowpalWabbit( + self.vw_path, corpus=self.corpus, id2word=self.dictionary, + num_topics=2, passes=0) def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" @@ -169,35 +177,38 @@ def testCnpmiVWModel(self): def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - coherence='u_mass') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + coherence='u_mass') # not providing texts for c_v and instead providing corpus - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - dictionary=self.dictionary, coherence='c_v') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, + dictionary=self.dictionary, coherence='c_v') # not providing corpus or texts for u_mass - self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, - dictionary=self.dictionary, coherence='u_mass') + self.assertRaises( + ValueError, CoherenceModel, topics=self.topics1, dictionary=self.dictionary, + coherence='u_mass') def testPersistence(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceCompressed(self): fname = testfile() + '.gz' - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceAfterProbabilityEstimationUsingCorpus(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, - coherence='u_mass') + model = CoherenceModel( + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) @@ -206,8 +217,8 @@ def testPersistenceAfterProbabilityEstimationUsingCorpus(self): def testPersistenceAfterProbabilityEstimationUsingTexts(self): fname = testfile() - model = CoherenceModel(topics=self.topics1, texts=self.texts, dictionary=self.dictionary, - coherence='c_v') + model = CoherenceModel( + topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v') model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 982230a526..1e674415f3 100644 --- 
a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -11,20 +11,22 @@ import logging import unittest -from gensim.topic_coherence import probability_estimation -from gensim.corpora.hashdictionary import HashDictionary from gensim.corpora.dictionary import Dictionary +from gensim.corpora.hashdictionary import HashDictionary +from gensim.topic_coherence import probability_estimation class BaseTestCases(object): class ProbabilityEstimationBase(unittest.TestCase): - texts = [['human', 'interface', 'computer'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees']] + texts = [ + ['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'] + ] dictionary = None def build_segmented_topics(self): diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 0c62d68985..552fe5c4d7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -70,7 +70,7 @@ def unique_ids_from_segments(segmented_topics): for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): if hasattr(word_id, '__iter__'): - top_ids = top_ids.union(word_id) + top_ids.update(word_id) else: top_ids.add(word_id) From 96d1349691b3729f2ae66f4e71c818a5cc1169db Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Fri, 9 Jun 2017 09:15:38 -0400 Subject: [PATCH 33/33] #1342: Clarify documentation in the `probability_estimation` module. --- .../topic_coherence/probability_estimation.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 552fe5c4d7..85e787de18 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -18,9 +18,9 @@ def p_boolean_document(corpus, segmented_topics): - """ - This function performs the boolean document probability estimation. Boolean document estimates the probability - of a single word as the number of documents in which the word occurs divided by the total number of documents. + """This function performs the boolean document probability estimation. + Boolean document estimates the probability of a single word as the number + of documents in which the word occurs divided by the total number of documents. Args: ---- @@ -29,19 +29,19 @@ def p_boolean_document(corpus, segmented_topics): Returns: ------- - per_topic_postings : Boolean document posting list for each unique topic id. - num_docs : Total number of documents in corpus. + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): - """ - This function performs the boolean sliding window probability estimation. Boolean sliding window - determines word counts using a sliding window. The window moves over the documents one word token per step. - Each step defines a new virtual document by copying the window content. 
Boolean document is applied to - these virtual documents to compute word probabilities. + """This function performs the boolean sliding window probability estimation. + Boolean sliding window determines word counts using a sliding window. The window + moves over the documents one word token per step. Each step defines a new virtual + document by copying the window content. Boolean document is applied to these virtual + documents to compute word probabilities. Args: ---- @@ -52,8 +52,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Returns: ------- - per_topic_postings : Boolean sliding window postings list of all the unique topic ids. - window_id[0] : Total no of windows + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -65,13 +65,21 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def unique_ids_from_segments(segmented_topics): - """Return the set of all unique ids in a list of segmented topics.""" - top_ids = set() # is a set of all the unique ids contained in topics. + """Return the set of all unique ids in a list of segmented topics. + + Args: + ---- + segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set + is either a single integer, or a `numpy.ndarray` of integers. + Returns: + unique_ids : set of unique ids across all topic segments. + """ + unique_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): if hasattr(word_id, '__iter__'): - top_ids.update(word_id) + unique_ids.update(word_id) else: - top_ids.add(word_id) + unique_ids.add(word_id) - return top_ids + return unique_ids
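
Note (appended after the series, not part of any patch): the direct confirmation refactor above now averages the per-segment values inside each topic, so each entry of `m_lc` / `m_lr` is a per-topic score. The expected numbers quoted in the `test_direct_confirmation.py` comments can be reproduced with a minimal standalone sketch; the toy counts below (num_docs, w_prime_count, w_star_count, co_occur_count) and the EPSILON value are assumptions chosen to match the ratios in those comments, not values copied from the test fixture.

    import math

    # Toy counts consistent with the ratios quoted in the test comments
    # (assumed for illustration; the real counts come from the accumulator).
    num_docs = 5.0
    w_prime_count, w_star_count, co_occur_count = 3.0, 2.0, 1.0
    EPSILON = 1e-12  # placeholder; the module defines its own constant

    # log conditional probability: ln(P(w', w*) / P(w*)) ~ ln(1/2)
    m_lc = math.log((co_occur_count / num_docs + EPSILON) / (w_star_count / num_docs))

    # log ratio measure (PMI): ln((1/5 + eps) / ((3/5) * (2/5)))
    numerator = co_occur_count / num_docs + EPSILON
    denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
    m_lr = math.log(numerator / denominator)

    # normalized log ratio measure (NPMI): PMI / -ln(P(w', w*) + eps)
    m_nlr = m_lr / -math.log(numerator)

    print(m_lc, m_lr, m_nlr)
    # roughly -0.69314718, -0.18232156, -0.11328275, matching the expected
    # values -0.693147181, -0.182321557 and -0.113282753 in the tests.

The sketch only mirrors the arithmetic visible in the hunks; the patched functions read these counts from the accumulator and then take `np.mean` over each topic's segment values.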
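
A similar sketch of the id collection that patch 32 switches from `union` to `update`: `set.update` mutates the set in place instead of allocating and discarding a new set for every segment, and it handles segment members that are either plain ints or iterables such as numpy arrays (as the docstring added in patch 33 describes). The sample ids here are made up for illustration.

    import itertools
    import numpy as np

    # Made-up segmented topics: each pair member is either an int id
    # or a numpy array of ids.
    segmented_topics = [
        [(1, 2), (3, np.array([1, 2]))],
        [(2, 4), (np.array([3, 4]), 1)],
    ]

    unique_ids = set()
    for s_i in segmented_topics:
        for word_id in itertools.chain.from_iterable(s_i):
            if hasattr(word_id, '__iter__'):
                unique_ids.update(word_id)  # in place; no throwaway set as with union()
            else:
                unique_ids.add(word_id)

    print(sorted(int(i) for i in unique_ids))  # [1, 2, 3, 4]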