diff --git a/docs/src/topic_coherence/text_analysis.rst b/docs/src/topic_coherence/text_analysis.rst index f4e3f7254e..ec9e14a795 100644 --- a/docs/src/topic_coherence/text_analysis.rst +++ b/docs/src/topic_coherence/text_analysis.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 02b18984ac..5463e8a025 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -560,10 +560,10 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, Args: corpus (gensim corpus): The corpus with which the author-topic model should be updated. - author2doc (dictionary): author to document mapping corresponding to indexes in input + author2doc (dict): author to document mapping corresponding to indexes in input corpus. - doc2author (dictionary): document to author mapping corresponding to indexes in input + doc2author (dict): document to author mapping corresponding to indexes in input corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 065943a28f..aa27c833f7 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -4,10 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform aggregation on a list of values -obtained from the confirmation measure. -""" +"""This module contains functions to perform aggregation on a list of values obtained from the confirmation measure.""" import logging import numpy as np @@ -17,13 +14,24 @@ def arithmetic_mean(confirmed_measures): """ - This functoin performs the arithmetic mean aggregation on the output obtained from + Perform the arithmetic mean aggregation on the output obtained from the confirmation measure module. - Args: - confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. + Parameters + ---------- + confirmed_measures : list of float + List of calculated confirmation measure on each set in the segmented topics. + + Returns + ------- + `numpy.float` + Arithmetic mean of all the values contained in confirmation measures. + + Examples + -------- + >>> from gensim.topic_coherence.aggregation import arithmetic_mean + >>> arithmetic_mean([1.1, 2.2, 3.3, 4.4]) + 2.75 - Returns: - mean : Arithmetic mean of all the values contained in confirmation measures. """ return np.mean(confirmed_measures) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index dfda360447..6482191d9c 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -4,9 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to compute direct confirmation on a pair of words or word subsets. -""" +"""This module contains functions to compute direct confirmation on a pair of words or word subsets.""" import logging @@ -14,27 +12,53 @@ logger = logging.getLogger(__name__) -EPSILON = 1e-12 # Should be small. Value as suggested in paper. +# Should be small. 
Value as suggested in paper http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf +EPSILON = 1e-12 def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False): - """ - This function calculates the log-conditional-probability measure - which is used by coherence measures such as U_mass. + """Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`. This is defined as :math:`m_{lc}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`. - Args: - segmented_topics (list): Output from the segmentation module of the segmented - topics. Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - with_std (bool): True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. - with_support (bool): True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. - - Returns: - list : of log conditional probability measure for each topic. + Parameters + ---------- + segmented_topics : list of lists of (int, int) + Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, + :func:`~gensim.topic_coherence.segmentation.s_one_one`. + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. + with_std : bool, optional + True to also include standard deviation across topic segment sets in addition to the mean coherence + for each topic. + with_support : bool, optional + True to also include support across topic segments. The support is defined as the number of pairwise + similarity comparisons were used to compute the overall topic coherence. + + Returns + ------- + list of float + Log conditional probabilities measurement for each topic. + + Examples + -------- + >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis + >>> from collections import namedtuple + >>> + >>> # Create dictionary + >>> id2token = {1: 'test', 2: 'doc'} + >>> token2id = {v: k for k, v in id2token.items()} + >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + >>> + >>> # Initialize segmented topics and accumulator + >>> segmentation = [[(1, 2)]] + >>> + >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + >>> accumulator._num_docs = 5 + >>> + >>> # result should be ~ ln(1 / 2) = -0.693147181 + >>> result = direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] + """ topic_coherences = [] num_docs = float(accumulator.num_docs) @@ -56,17 +80,33 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w def aggregate_segment_sims(segment_sims, with_std, with_support): - """Compute various statistics from the segment similarities generated via - set pairwise comparisons of top-N word lists for a single topic. - - Args: - segment_sims (iterable): floating point similarity values to aggregate. - with_std (bool): Set to True to include standard deviation. - with_support (bool): Set to True to include number of elements in `segment_sims` - as a statistic in the results returned. 
+ """Compute various statistics from the segment similarities generated via set pairwise comparisons + of top-N word lists for a single topic. + + Parameters + ---------- + segment_sims : iterable of float + Similarity values to aggregate. + with_std : bool + Set to True to include standard deviation. + with_support : bool + Set to True to include number of elements in `segment_sims` as a statistic in the results returned. + + Returns + ------- + (float[, float[, int]]) + Tuple with (mean[, std[, support]]). + + Examples + --------- + >>> from gensim.topic_coherence import direct_confirmation_measure + >>> + >>> segment_sims = [0.2, 0.5, 1., 0.05] + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, True, True) + (0.4375, 0.36293077852394939, 4) + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, False, False) + 0.4375 - Returns: - tuple: with (mean[, std[, support]]) """ mean = np.mean(segment_sims) stats = [mean] @@ -78,32 +118,61 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): return stats[0] if len(stats) == 1 else tuple(stats) -def log_ratio_measure( - segmented_topics, accumulator, normalize=False, with_std=False, with_support=False): - """ - If normalize=False: - Popularly known as PMI. - This function calculates the log-ratio-measure which is used by - coherence measures such as c_v. - This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] - - If normalize=True: - This function calculates the normalized-log-ratio-measure, popularly knowns as - NPMI which is used by coherence measures such as c_v. - This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] - - Args: - segmented_topics (list): Output from the segmentation module of the segmented - topics. Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - with_std (bool): True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. - with_support (bool): True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. - - Returns: - list : of log ratio measure for each topic. +def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=False, with_support=False): + """Compute log ratio measure for `segment_topics`. + + Parameters + ---------- + segmented_topics : list of lists of (int, int) + Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, + :func:`~gensim.topic_coherence.segmentation.s_one_one`. + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. + normalize : bool, optional + Details in the "Notes" section. + with_std : bool, optional + True to also include standard deviation across topic segment sets in addition to the mean coherence + for each topic. + with_support : bool, optional + True to also include support across topic segments. The support is defined as the number of pairwise + similarity comparisons were used to compute the overall topic coherence. + + Notes + ----- + If `normalize=False`: + Calculate the log-ratio-measure, popularly known as **PMI** which is used by coherence measures such as `c_v`. 
+ This is defined as :math:`m_{lr}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}` + + If `normalize=True`: + Calculate the normalized-log-ratio-measure, popularly knowns as **NPMI** + which is used by coherence measures such as `c_v`. + This is defined as :math:`m_{nlr}(S_i) = \\frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}` + + Returns + ------- + list of float + Log ratio measurements for each topic. + + Examples + -------- + >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis + >>> from collections import namedtuple + >>> + >>> # Create dictionary + >>> id2token = {1: 'test', 2: 'doc'} + >>> token2id = {v: k for k, v in id2token.items()} + >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + >>> + >>> # Initialize segmented topics and accumulator + >>> segmentation = [[(1, 2)]] + >>> + >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + >>> accumulator._num_docs = 5 + >>> + >>> # result should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 + >>> result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0] + """ topic_coherences = [] num_docs = float(accumulator.num_docs) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 33b42223bb..fdcbd1565f 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -4,11 +4,12 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -r""" -This module contains functions to compute confirmation on a pair of words or word subsets. +r"""This module contains functions to compute confirmation on a pair of words or word subsets. -The advantage of indirect confirmation measure is that it computes similarity of words in W' and -W* with respect to direct confirmations to all words. Eg. Suppose x and z are both competing +Notes +----- +The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and +:math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing brands of cars, which semantically support each other. However, both brands are seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. @@ -16,15 +17,17 @@ The formula used to compute indirect confirmation measure is - m_{sim}_{(m, \gamma)}(W', W*) = - s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) +.. math:: -where s_sim can be cosine, dice or jaccard similarity and + \widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*})) - \vec{V}^{\,}_{m,\gamma}(W') = - \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} -Here 'm' is the direct confirmation measure used. +where :math:`s_{sim}` can be cosine, dice or jaccard similarity and + +.. 
math::
+
+    \vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
+
 """

 import itertools
@@ -33,28 +36,53 @@
 import numpy as np
 import scipy.sparse as sps

-from gensim.topic_coherence.direct_confirmation_measure import (
-    aggregate_segment_sims, log_ratio_measure)
+from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure

 logger = logging.getLogger(__name__)


 def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
     """For each topic segmentation, compute average cosine similarity using a
-    WordVectorsAccumulator.
-
-    Args:
-        segmented_topics (list): Output from the segmentation module of the segmented
-            topics. Is a list of list of tuples.
-        accumulator: word occurrence accumulator from probability_estimation.
-        with_std (bool): True to also include standard deviation across topic segment
-            sets in addition to the mean coherence for each topic; default is False.
-        with_support (bool): True to also include support across topic segments. The
-            support is defined as the number of pairwise similarity comparisons were
-            used to compute the overall topic coherence.
-
-    Returns:
-        list : of word2vec cosine similarities per topic.
+    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.
+
+    Parameters
+    ----------
+    segmented_topics : list of lists of (int, `numpy.ndarray`)
+        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
+    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
+        :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
+        Word occurrence accumulator.
+    with_std : bool, optional
+        True to also include standard deviation across topic segment sets
+        in addition to the mean coherence for each topic.
+    with_support : bool, optional
+        True to also include support across topic segments. The support is defined as
+        the number of pairwise similarity comparisons that were used to compute the overall topic coherence.
+
+    Returns
+    -------
+    list of (float[, float[, int]])
+        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from gensim.corpora.dictionary import Dictionary
+    >>> from gensim.topic_coherence import indirect_confirmation_measure
+    >>> from gensim.topic_coherence import text_analysis
+    >>>
+    >>> # create segmentation
+    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
+    >>>
+    >>> # create accumulator
+    >>> dictionary = Dictionary()
+    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
+    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
+    >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
+    >>>
+    >>> # should be ~(0.726752426218, 0.00695475919227)
+    >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]
+
     """
     topic_coherences = []
     total_oov = 0
@@ -85,44 +113,54 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp

     return topic_coherences


-def cosine_similarity(
-        segmented_topics, accumulator, topics, measure='nlr', gamma=1,
-        with_std=False, with_support=False):
-    r"""
-    This function calculates the indirect cosine measure.
-
-    Given context vectors u = V(W') and w = V(W*) for the
-    word sets of a pair S_i = (W', W*) indirect cosine measure
-    is computed as the cosine similarity between u and w.
-
-    The formula used is
-
-    m_{sim}_{(m, \gamma)}(W', W*) =
-        s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
-
-    where each vector
-
-    \vec{V}^{\,}_{m,\gamma}(W') =
-        \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
-
-    Args:
-        segmented_topics: Output from the segmentation module of the
-            segmented topics. Is a list of list of tuples.
-        accumulator: Output from the probability_estimation module. Is an
-            accumulator of word occurrences (see text_analysis module).
-        topics: Topics obtained from the trained topic model.
-        measure (str): Direct confirmation measure to be used. Supported
-            values are "nlr" (normalized log ratio).
-        gamma: Gamma value for computing W', W* vectors; default is 1.
-        with_std (bool): True to also include standard deviation across topic
-            segment sets in addition to the mean coherence for each topic;
-            default is False.
-        with_support (bool): True to also include support across topic segments.
-            The support is defined as the number of pairwise similarity
-            comparisons were used to compute the overall topic coherence.
-
-    Returns:
-        list: of indirect cosine similarity measure for each topic.
+def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
+                      gamma=1, with_std=False, with_support=False):
+    """Calculate the indirect cosine measure.
+
+    Parameters
+    ----------
+    segmented_topics : list of lists of (int, `numpy.ndarray`)
+        Output from the segmentation module of the segmented topics.
+    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
+        Output from the probability_estimation module. Is an accumulator of word occurrences
+        (see :mod:`gensim.topic_coherence.text_analysis`).
+    topics : list of `numpy.ndarray`
+        Topics obtained from the trained topic model.
+    measure : str, optional
+        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
+    gamma : float, optional
+        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors.
+    with_std : bool, optional
+        True to also include standard deviation across topic segment sets in addition to the mean coherence
+        for each topic.
+    with_support : bool, optional
+        True to also include support across topic segments. The support is defined as the number of pairwise
+        similarity comparisons that were used to compute the overall topic coherence.
+
+    Returns
+    -------
+    list
+        List of indirect cosine similarity measures for each topic.
+
+    Examples
+    --------
+    >>> from gensim.corpora.dictionary import Dictionary
+    >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
+    >>> import numpy as np
+    >>>
+    >>> # create accumulator
+    >>> dictionary = Dictionary()
+    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
+    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
+    >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
+    >>> accumulator._num_docs = 5
+    >>>
+    >>> # create topics
+    >>> topics = [np.array([1, 2])]
+    >>>
+    >>> # create segmentation
+    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
+    >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
+    >>> print obtained[0]
+    0.623018926945
+
     """
     context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

@@ -141,9 +179,52 @@ def cosine_similarity(

 class ContextVectorComputer(object):
-    """Lazily compute context vectors for topic segments."""
+    """Lazily compute context vectors for topic segments.
+
+    Parameters
+    ----------
+    measure : str
+        Confirmation measure.
+    topics : list of numpy.array
+        Topics.
+ accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from probability_estimation. + gamma: float + Value for computing vectors. + + Attributes + ---------- + sim_cache: dict + Cache similarities between tokens (pairs of word ids), e.g. (1, 2). + context_vector_cache: dict + Mapping from (segment, topic_words) --> context_vector. + + Example + ------- + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis + >>> import numpy as np + >>> + >>> # create measure, topics + >>> measure = 'nlr' + >>> topics = [np.array([1, 2])] + >>> + >>> # create accumulator + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} + >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) + >>> _ = accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1) + >>> cont_vect_comp.mapping + {1: 0, 2: 1} + >>> cont_vect_comp.vocab_size + 2 + + """ def __init__(self, measure, topics, accumulator, gamma): + if measure == 'nlr': self.similarity = _pair_npmi else: @@ -154,16 +235,27 @@ def __init__(self, measure, topics, accumulator, gamma): self.vocab_size = len(self.mapping) self.accumulator = accumulator self.gamma = gamma - self.sim_cache = {} # Cache similarities between tokens (pairs of word ids), e.g. (1, 2) - self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector + self.sim_cache = {} + self.context_vector_cache = {} def __getitem__(self, idx): return self.compute_context_vector(*idx) def compute_context_vector(self, segment_word_ids, topic_word_ids): - """ - Step 1. Check if (segment_word_ids, topic_word_ids) context vector has been cached. - Step 2. If yes, return corresponding context vector, else compute, cache, and return. + """Check if (segment_word_ids, topic_word_ids) context vector has been cached. + + Parameters + ---------- + segment_word_ids: list + Ids of words in segment. + topic_word_ids: list + Ids of words in topic. + Returns + ------- + csr_matrix :class:`~scipy.sparse.csr` + If context vector has been cached, then return corresponding context vector, + else compute, cache, and return. + """ key = _key_for_segment(segment_word_ids, topic_word_ids) context_vector = self.context_vector_cache.get(key, None) @@ -173,7 +265,20 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): return context_vector def _make_seg(self, segment_word_ids, topic_word_ids): - """Internal helper function to return context vectors for segmentations.""" + """Return context vectors for segmentation (Internal helper function). + + Parameters + ---------- + segment_word_ids : iterable or int + Ids of words in segment. + topic_word_ids : list + Ids of words in topic. + Returns + ------- + csr_matrix :class:`~scipy.sparse.csr` + Matrix in Compressed Sparse Row format + + """ context_vector = sps.lil_matrix((self.vocab_size, 1)) if not hasattr(segment_word_ids, '__iter__'): segment_word_ids = (segment_word_ids,) @@ -190,8 +295,20 @@ def _make_seg(self, segment_word_ids, topic_word_ids): def _pair_npmi(pair, accumulator): - """Compute normalized pairwise mutual information (NPMI) between a pair of words. - The pair is an iterable of (word_id1, word_id2). 
+ """Compute normalized pairwise mutual information (**NPMI**) between a pair of words. + + Parameters + ---------- + pair : (int, int) + The pair of words (word_id1, word_id2). + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from probability_estimation. + + Return + ------ + float + NPMI between a pair of words. + """ return log_ratio_measure([[pair]], accumulator, True)[0] diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index f59692bdcc..404310a36c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -4,9 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform segmentation on a list of topics. -""" +"""This module contains functions to perform segmentation on a list of topics.""" import itertools import logging @@ -19,38 +17,119 @@ def p_boolean_document(corpus, segmented_topics): - """This function performs the boolean document probability estimation. - Boolean document estimates the probability of a single word as the number - of documents in which the word occurs divided by the total number of documents. + """Perform the boolean document probability estimation. Boolean document estimates the probability of a single word + as the number of documents in which the word occurs divided by the total number of documents. + + Parameters + ---------- + corpus : iterable of list of (int, int) + The corpus of documents. + segmented_topics: list of (int, int). + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. + + Returns + ------- + :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator` + Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. + + Examples + --------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> + >>> + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] + >>> dictionary = HashDictionary(texts) + >>> w2id = dictionary.token2id + >>> + >>> # create segmented_topics + >>> segmented_topics = [ + ... [(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... ] + >>> + >>> # create corpus + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> + >>> result = probability_estimation.p_boolean_document(corpus, segmented_topics) + >>> result.index_to_dict() + {10608: set([0]), 12736: set([1, 3]), 18451: set([5]), 5798: set([1, 2])} - Args: - corpus : The corpus of documents. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - - Returns: - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. 
""" top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): - """This function performs the boolean sliding window probability estimation. + """Perform the boolean sliding window probability estimation. + + Parameters + ---------- + texts : iterable of iterable of str + Input text + segmented_topics: list of (int, int) + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Gensim dictionary mapping of the tokens and ids. + window_size : int + Size of the sliding window, 110 found out to be the ideal size for large corpora. + processes : int, optional + Number of process that will be used for + :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator` + + Notes + ----- Boolean sliding window determines word counts using a sliding window. The window moves over the documents one word token per step. Each step defines a new virtual document by copying the window content. Boolean document is applied to these virtual documents to compute word probabilities. - Args: - texts : List of string sentences. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - dictionary : Gensim dictionary mapping of the tokens and ids. - window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. + Returns + ------- + :class:`~gensim.topic_coherence.text_analysis.WordOccurrenceAccumulator` + if `processes` = 1 OR + :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator` + otherwise. This is word occurrence accumulator instance that can be used to lookup + token frequencies and co-occurrence frequencies. + + Examples + --------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> + >>> + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] + >>> dictionary = HashDictionary(texts) + >>> w2id = dictionary.token2id + + >>> + >>> # create segmented_topics + >>> segmented_topics = [ + ... [(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... ] + >>> + >>> # create corpus + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> accumulator = probability_estimation.p_boolean_sliding_window(texts, segmented_topics, dictionary, 2) + >>> + >>> (accumulator[w2id['computer']], accumulator[w2id['user']], accumulator[w2id['system']]) + (1, 3, 4) - Returns: - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -62,10 +141,59 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes=1, model=None): - """Train word2vec model on `texts` if model is not None. - Returns: - ---- - accumulator: text accumulator with trained context vectors. 
+ """Train word2vec model on `texts` if `model` is not None. + + Parameters + ---------- + texts : iterable of iterable of str + Input text + segmented_topics : iterable of iterable of str + Output from the segmentation of topics. Could be simply topics too. + dictionary : :class:`~gensim.corpora.dictionary` + Gensim dictionary mapping of the tokens and ids. + window_size : int, optional + Size of the sliding window. + processes : int, optional + Number of processes to use. + model : :class:`~gensim.models.word2vec.Word2Vec` or :class:`~gensim.models.keyedvectors.KeyedVectors`, optional + If None, a new Word2Vec model is trained on the given text corpus. Otherwise, + it should be a pre-trained Word2Vec context vectors. + + Returns + ------- + :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` + Text accumulator with trained context vectors. + + Examples + -------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.models import word2vec + >>> + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] + >>> dictionary = HashDictionary(texts) + >>> w2id = dictionary.token2id + + >>> + >>> # create segmented_topics + >>> segmented_topics = [ + ... [(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... ] + >>> + >>> # create corpus + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> sentences = [['human', 'interface', 'computer'],['survey', 'user', 'computer', 'system', 'response', 'time']] + >>> model = word2vec.Word2Vec(sentences, size=100,min_count=1) + >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model) + """ top_ids = unique_ids_from_segments(segmented_topics) accumulator = WordVectorsAccumulator( @@ -76,11 +204,24 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= def unique_ids_from_segments(segmented_topics): """Return the set of all unique ids in a list of segmented topics. - Args: - segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set - is either a single integer, or a `numpy.ndarray` of integers. - Returns: - unique_ids : set of unique ids across all topic segments. + Parameters + ---------- + segmented_topics: list of (int, int). + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. + + Returns + ------- + set + Set of unique ids across all topic segments. + + Example + ------- + >>> from gensim.topic_coherence import probability_estimation + >>> + >>> segmentation = [[(1, 2)]] + >>> probability_estimation.unique_ids_from_segments(segmentation) + set([1, 2]) + """ unique_ids = set() # is a set of all the unique ids contained in topics. 
for s_i in segmented_topics: diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 8d3185dbbb..9629369b42 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -4,9 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform segmentation on a list of topics. -""" +"""This module contains functions to perform segmentation on a list of topics.""" import logging @@ -14,21 +12,32 @@ def s_one_pre(topics): - """ - This function performs s_one_pre segmentation on a list of topics. - s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} - Example: - - >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> s_one_pre(topics) - [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] + """Performs segmentation on a list of topics. + + Notes + ----- + Segmentation is defined as + :math:`s_{pre} = {(W', W^{*}) | W' = w_{i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i > j}`. + + Parameters + ---------- + topics : list of np.array + list of topics obtained from an algorithm such as LDA. + + Returns + ------- + list of list of (int, int) + :math:`(W', W^{*})` for all unique topic ids. + + Examples + -------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> segmentation.s_one_pre(topics) + [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] - Args: - topics : list of topics obtained from an algorithm such as LDA. - Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - - Returns: - s_one_pre_res : list of list of (W', W*) tuples for all unique topic ids """ s_one_pre_res = [] @@ -43,21 +52,29 @@ def s_one_pre(topics): def s_one_one(topics): - """ - This function performs s_one_one segmentation on a list of topics. - s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} - Example: + """Perform segmentation on a list of topics. + Segmentation is defined as + :math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i \\neq j}`. + + Parameters + ---------- + topics : list of `numpy.ndarray` + List of topics obtained from an algorithm such as LDA. + + Returns + ------- + list of list of (int, int). + :math:`(W', W^{*})` for all unique topic ids. + + Examples + ------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> segmentation.s_one_one(topics) + [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> s_one_pre(topics) - [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - - Args: - topics : list of topics obtained from an algorithm such as LDA. - Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - - Returns: - s_one_one_res : list of list of (W', W*) tuples for all unique topic ids """ s_one_one_res = [] @@ -75,22 +92,29 @@ def s_one_one(topics): def s_one_set(topics): - """ - This function performs s_one_set segmentation on a list of topics. 
- s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} - Example: - >>> topics = [np.array([9, 10, 7]) - >>> s_one_set(topics) - [[(9, array([ 9, 10, 7])), - (10, array([ 9, 10, 7])), - (7, array([ 9, 10, 7]))]] - - Args: - topics : list of topics obtained from an algorithm such as LDA. - Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - - Returns: - s_one_set_res : list of list of (W', W*) tuples for all unique topic ids. + """Perform s_one_set segmentation on a list of topics. + Segmentation is defined as + :math:`s_{set} = {(W', W^{*}) | W' = {w_i}; w_{i} \in W; W^{*} = W}` + + Parameters + ---------- + topics : list of `numpy.ndarray` + List of topics obtained from an algorithm such as LDA. + + Returns + ------- + list of list of (int, int). + :math:`(W', W^{*})` for all unique topic ids. + + Examples + -------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> + >>> topics = [np.array([9, 10, 7])] + >>> segmentation.s_one_set(topics) + [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] + """ s_one_set_res = [] diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 340286c8d1..b759e0a13a 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -4,10 +4,8 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains classes for analyzing the texts of a corpus to accumulate -statistical information about word occurrences. -""" +"""This module contains classes for analyzing the texts of a corpus to accumulate +statistical information about word occurrences.""" import itertools import logging @@ -27,11 +25,32 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. - This function abstracts away the differences between the HashDictionary and the standard one. + Abstract away the differences between the HashDictionary and the standard one. + + Parameters + ---------- + ids: dict + Dictionary of ids and their words. + dictionary: :class:`~gensim.corpora.dictionary.Dictionary` + Input gensim dictionary + + Returns + ------- + set + Corresponding words. + + Examples + -------- + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import text_analysis + >>> + >>> dictionary = Dictionary() + >>> ids = {1: 'fake', 4: 'cats'} + >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} + >>> + >>> text_analysis._ids_to_words(ids, dictionary) + set(['cats', 'fake']) - Args: - ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). - This is the format returned by the topic_coherence.segmentation functions. """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) @@ -48,9 +67,40 @@ def _ids_to_words(ids, dictionary): class BaseAnalyzer(object): - """Base class for corpus and text analyzers.""" + """Base class for corpus and text analyzers. + + Attributes + ---------- + relevant_ids : dict + Mapping + _vocab_size : int + Size of vocabulary. + id2contiguous : dict + Mapping word_id -> number. + log_every : int + Interval for logging. + _num_docs : int + Number of documents. 
+ """ def __init__(self, relevant_ids): + """ + + Parameters + ---------- + relevant_ids : dict + Mapping + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> ids = {1: 'fake', 4: 'cats'} + >>> base = text_analysis.BaseAnalyzer(ids) + >>> # should return {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 + >>> print base.relevant_ids, base._vocab_size, base.id2contiguous, base.log_every, base._num_docs + {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 + + """ self.relevant_ids = relevant_ids self._vocab_size = len(self.relevant_ids) self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} @@ -97,9 +147,40 @@ class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id mapping. - """ + Attributes + ---------- + relevant_words : set + Set of words that occurrences should be accumulated for. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary based on text + token2id : dict + Mapping from :class:`~gensim.corpora.dictionary.Dictionary` + + """ def __init__(self, relevant_ids, dictionary): + """ + + Parameters + ---------- + relevant_ids : dict + Mapping + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary based on text + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> from gensim.corpora.dictionary import Dictionary + >>> + >>> ids = {1: 'foo', 2: 'bar'} + >>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) + >>> udict = text_analysis.UsesDictionary(ids, dictionary) + >>> + >>> print udict.relevant_words + set([u'foo', u'baz']) + + """ super(UsesDictionary, self).__init__(relevant_ids) self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) self.dictionary = dictionary @@ -131,6 +212,24 @@ class InvertedIndexBased(BaseAnalyzer): """Analyzer that builds up an inverted index to accumulate stats.""" def __init__(self, *args): + """ + + Parameters + ---------- + args : dict + Look at :class:`~gensim.topic_coherence.text_analysis.BaseAnalyzer` + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> + >>> ids = {1: 'fake', 4: 'cats'} + >>> ininb = text_analysis.InvertedIndexBased(ids) + >>> + >>> print ininb._inverted_index + [set([]) set([])] + + """ super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) @@ -151,6 +250,7 @@ class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text, doc_num=None): + """Build an inverted index from a sequence of corpus texts.""" doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) for word_id in top_ids_in_doc: @@ -168,9 +268,14 @@ class WindowedTextsAnalyzer(UsesDictionary): def __init__(self, relevant_ids, dictionary): """ - Args: - relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: Dictionary instance with mappings for the relevant_ids. + + Parameters + ---------- + relevant_ids : set of int + Relevant id + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary instance with mappings for the relevant_ids. 
+ """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token @@ -195,7 +300,7 @@ def _iter_texts(self, texts): for w in text], dtype=dtype) def text_is_relevant(self, text): - """Return True if the text has any relevant words, else False.""" + """Check if the text has any relevant words.""" for word in text: if word in self.relevant_words: return True @@ -232,10 +337,14 @@ def accumulate(self, texts, window_size): return self def partial_accumulate(self, texts, window_size): - """Meant to be called several times to accumulate partial results. The final - accumulation should be performed with the `accumulate` method as opposed to this one. + """Meant to be called several times to accumulate partial results. + + Notes + ----- + The final accumulation should be performed with the `accumulate` method as opposed to this one. This method does not ensure the co-occurrence matrix is in lil format and does not symmetrize it after accumulation. + """ self._current_doc_num = -1 self._token_at_edge = None @@ -267,8 +376,12 @@ def _slide_window(self, window, doc_num): def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. + + Notes + ----- Rather than enforcing a particular ordering during the update process, we choose to symmetrize the co-occurrence matrix after accumulation has completed. + """ co_occ = self._co_occurrences co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts @@ -288,24 +401,26 @@ def merge(self, other): class PatchedWordOccurrenceAccumulator(WordOccurrenceAccumulator): - """Monkey patched for multiprocessing worker usage, - to move some of the logic to the master process. - """ + """Monkey patched for multiprocessing worker usage, to move some of the logic to the master process.""" def _iter_texts(self, texts): return texts # master process will handle this class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): - """Accumulate word occurrences in parallel.""" + """Accumulate word occurrences in parallel. + + Attributes + ---------- + processes : int + Number of processes to use; must be at least two. + args : + Should include `relevant_ids` and `dictionary` (see :class:`~UsesDictionary.__init__`). + kwargs : + Can include `batch_size`, which is the number of docs to send to a worker at a time. + If not included, it defaults to 64. + """ def __init__(self, processes, *args, **kwargs): - """ - Args: - processes : number of processes to use; must be at least two. - args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 64. - """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: raise ValueError( @@ -332,9 +447,19 @@ def accumulate(self, texts, window_size): def start_workers(self, window_size): """Set up an input and output queue and start processes for each worker. + Notes + ----- The input queue is used to transmit batches of documents to the workers. The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. - Returns: tuple of (list of workers, input queue, output queue). + + Parameters + ---------- + window_size : int + + Returns + ------- + (list of lists) + Tuple of (list of workers, input queue, output queue). 
""" input_q = mp.Queue(maxsize=self.processes) output_q = mp.Queue() @@ -348,9 +473,7 @@ def start_workers(self, window_size): return workers, input_q, output_q def yield_batches(self, texts): - """Return a generator over the given texts that yields batches of - `batch_size` texts at a time. - """ + """Return a generator over the given texts that yields batches of `batch_size` texts at a time.""" batch = [] for text in self._iter_texts(texts): batch.append(text) @@ -375,17 +498,19 @@ def queue_all_texts(self, q, texts, window_size): (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): - """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, - then terminate each. We do not use join here because it has been shown to have some issues - in Python 2.7 (and even in later versions). This method also closes both the input and output - queue. + """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, then terminate each. + Warnings + -------- + We do not use join here because it has been shown to have some issues + in Python 2.7 (and even in later versions). This method also closes both the input and output queue. If `interrupted` is False (normal execution), a None value is placed on the input queue for each worker. The workers are looking for this sentinel value and interpret it as a signal to terminate themselves. If `interrupted` is True, a KeyboardInterrupt occurred. The workers are programmed to recover from this and continue on to transmit their results before terminating. So in this instance, the sentinel values are not queued, but the rest of the execution continues as usual. + """ if not interrupted: for _ in workers: @@ -408,6 +533,7 @@ def merge_accumulators(self, accumulators): """Merge the list of accumulators into a single `WordOccurrenceAccumulator` with all occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed by all the individual accumulators. + """ accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) for other_accumulator in accumulators: @@ -469,17 +595,18 @@ def reply_to_master(self): class WordVectorsAccumulator(UsesDictionary): - """Accumulate context vectors for words using word vector embeddings.""" + """Accumulate context vectors for words using word vector embeddings. + + Attributes + ---------- + model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + If None, a new Word2Vec model is trained on the given text corpus. Otherwise, + it should be a pre-trained Word2Vec context vectors. + model_kwargs: + if model is None, these keyword arguments will be passed through to the Word2Vec constructor. + """ def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs): - """ - Args: - model: if None, a new Word2Vec model is trained on the given text corpus. - If not None, it should be a pre-trained Word2Vec context vectors - (gensim.models.keyedvectors.KeyedVectors instance). - model_kwargs: if model is None, these keyword arguments will be passed - through to the Word2Vec constructor. - """ super(WordVectorsAccumulator, self).__init__(relevant_ids, dictionary) self.model = model self.model_kwargs = model_kwargs