diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py
index 552fe5c4d7..85e787de18 100644
--- a/gensim/topic_coherence/probability_estimation.py
+++ b/gensim/topic_coherence/probability_estimation.py
@@ -18,9 +18,9 @@ def p_boolean_document(corpus, segmented_topics):
-    """
-    This function performs the boolean document probability estimation. Boolean document estimates the probability
-    of a single word as the number of documents in which the word occurs divided by the total number of documents.
+    """This function performs the boolean document probability estimation.
+    Boolean document estimates the probability of a single word as the number
+    of documents in which the word occurs divided by the total number of documents.
 
     Args:
     ----
@@ -29,19 +29,19 @@ def p_boolean_document(corpus, segmented_topics):
 
     Returns:
     -------
-    per_topic_postings : Boolean document posting list for each unique topic id.
-    num_docs : Total number of documents in corpus.
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+        frequencies and co-occurrence frequencies.
 
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     return CorpusAccumulator(top_ids).accumulate(corpus)
 
 
 def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
-    """
-    This function performs the boolean sliding window probability estimation. Boolean sliding window
-    determines word counts using a sliding window. The window moves over the documents one word token per step.
-    Each step defines a new virtual document by copying the window content. Boolean document is applied to
-    these virtual documents to compute word probabilities.
+    """This function performs the boolean sliding window probability estimation.
+    Boolean sliding window determines word counts using a sliding window. The window
+    moves over the documents one word token per step. Each step defines a new virtual
+    document by copying the window content. Boolean document is applied to these virtual
+    documents to compute word probabilities.
 
     Args:
     ----
@@ -52,8 +52,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p
 
     Returns:
     -------
-    per_topic_postings : Boolean sliding window postings list of all the unique topic ids.
-    window_id[0] : Total no of windows
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+        frequencies and co-occurrence frequencies.
 
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     if processes <= 1:
@@ -65,13 +65,21 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p
 
 
 def unique_ids_from_segments(segmented_topics):
-    """Return the set of all unique ids in a list of segmented topics."""
-    top_ids = set()  # is a set of all the unique ids contained in topics.
+    """Return the set of all unique ids in a list of segmented topics.
+
+    Args:
+    ----
+    segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set
+        is either a single integer, or a `numpy.ndarray` of integers.
+    Returns:
+        unique_ids : set of unique ids across all topic segments.
+    """
+    unique_ids = set()  # is a set of all the unique ids contained in topics.
     for s_i in segmented_topics:
         for word_id in itertools.chain.from_iterable(s_i):
             if hasattr(word_id, '__iter__'):
-                top_ids.update(word_id)
+                unique_ids.update(word_id)
             else:
-                top_ids.add(word_id)
+                unique_ids.add(word_id)
 
-    return top_ids
+    return unique_ids
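
A minimal usage sketch (not part of the patch) of the functions touched above, illustrating that both estimators now return an accumulator instead of `(postings, num_docs)` tuples. The toy texts and topic are invented for illustration, and the `num_docs` / `get_occurrences()` accessors are assumed from `gensim.topic_coherence.text_analysis`:

```python
# Sketch of the post-change API; toy data is made up, and the accumulator
# accessors used below (num_docs, get_occurrences) are assumed from
# gensim.topic_coherence.text_analysis rather than shown in this diff.
import numpy as np

from gensim.corpora.dictionary import Dictionary
from gensim.topic_coherence import probability_estimation, segmentation

texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# One toy topic given as an array of token ids, segmented with s_one_pre.
topic = np.array([dictionary.token2id[w] for w in ('human', 'computer', 'system')])
segmented = segmentation.s_one_pre([topic])

# The ids the estimators will accumulate counts for.
print(probability_estimation.unique_ids_from_segments(segmented))

# Boolean-document estimation: returns an accumulator, not a postings dict.
accumulator = probability_estimation.p_boolean_document(corpus, segmented)
print(accumulator.num_docs)  # total number of documents seen
print(accumulator.get_occurrences(dictionary.token2id['computer']))  # document frequency
```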