Commit

piskvorky#1342: Clarify documentation in the probability_estimation module.
Sweeney, Mack committed Jun 9, 2017
1 parent 75fcac8 commit 96d1349
Showing 1 changed file with 25 additions and 17 deletions.
gensim/topic_coherence/probability_estimation.py: 42 changes (25 additions, 17 deletions)
@@ -18,9 +18,9 @@


 def p_boolean_document(corpus, segmented_topics):
-    """
-    This function performs the boolean document probability estimation. Boolean document estimates the probability
-    of a single word as the number of documents in which the word occurs divided by the total number of documents.
+    """This function performs the boolean document probability estimation.
+    Boolean document estimates the probability of a single word as the number
+    of documents in which the word occurs divided by the total number of documents.
     Args:
     ----
@@ -29,19 +29,19 @@ def p_boolean_document(corpus, segmented_topics):
     Returns:
     -------
-    per_topic_postings : Boolean document posting list for each unique topic id.
-    num_docs : Total number of documents in corpus.
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+                  frequencies and co-occurrence frequencies.
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     return CorpusAccumulator(top_ids).accumulate(corpus)
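
A quick usage sketch (not part of the commit): it assumes the returned accumulator exposes num_docs, get_occurrences and get_co_occurrences, as the updated docstring describes, and it builds a toy bag-of-words corpus with gensim's Dictionary.

from gensim.corpora import Dictionary
from gensim.topic_coherence import probability_estimation

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system'],
         ['eps', 'user', 'interface', 'system']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# One topic, already segmented into (W', W*) pairs of word ids.
ids = [dictionary.token2id[w] for w in ('human', 'computer', 'system')]
segmented_topics = [[(ids[0], ids[1]), (ids[1], ids[2])]]

accumulator = probability_estimation.p_boolean_document(corpus, segmented_topics)
print(accumulator.num_docs)                              # total documents in the corpus
print(accumulator.get_occurrences(ids[1]))               # documents containing 'computer'
print(accumulator.get_co_occurrences(ids[0], ids[1]))    # documents containing both words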


 def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
-    """
-    This function performs the boolean sliding window probability estimation. Boolean sliding window
-    determines word counts using a sliding window. The window moves over the documents one word token per step.
-    Each step defines a new virtual document by copying the window content. Boolean document is applied to
-    these virtual documents to compute word probabilities.
+    """This function performs the boolean sliding window probability estimation.
+    Boolean sliding window determines word counts using a sliding window. The window
+    moves over the documents one word token per step. Each step defines a new virtual
+    document by copying the window content. Boolean document is applied to these virtual
+    documents to compute word probabilities.
     Args:
     ----
@@ -52,8 +52,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
     Returns:
     -------
-    per_topic_postings : Boolean sliding window postings list of all the unique topic ids.
-    window_id[0] : Total no of windows
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+                  frequencies and co-occurrence frequencies.
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     if processes <= 1:
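
The updated docstring above describes boolean sliding window in terms of "virtual documents". The following standalone sketch (not the gensim implementation) illustrates that idea: each window position over a token list is copied out as one boolean virtual document, and boolean document estimation is then applied to those windows.

def virtual_documents(tokens, window_size):
    """Yield each window position's content as a set of tokens (one virtual document per step)."""
    if len(tokens) <= window_size:
        yield set(tokens)
        return
    for start in range(len(tokens) - window_size + 1):
        yield set(tokens[start:start + window_size])

doc = ['the', 'quick', 'brown', 'fox', 'jumps']
for window in virtual_documents(doc, window_size=3):
    print(window)
# Word (co-)occurrence counts are then taken over these windows instead of over
# whole documents, exactly as in the boolean document estimator.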
@@ -65,13 +65,21 @@


 def unique_ids_from_segments(segmented_topics):
-    """Return the set of all unique ids in a list of segmented topics."""
-    top_ids = set()  # is a set of all the unique ids contained in topics.
+    """Return the set of all unique ids in a list of segmented topics.
+    Args:
+    ----
+    segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set
+        is either a single integer, or a `numpy.ndarray` of integers.
+    Returns:
+    unique_ids : set of unique ids across all topic segments.
+    """
+    unique_ids = set()  # is a set of all the unique ids contained in topics.
     for s_i in segmented_topics:
         for word_id in itertools.chain.from_iterable(s_i):
             if hasattr(word_id, '__iter__'):
-                top_ids.update(word_id)
+                unique_ids.update(word_id)
             else:
-                top_ids.add(word_id)
+                unique_ids.add(word_id)
 
-    return top_ids
+    return unique_ids
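
For reference, a small sketch of what unique_ids_from_segments collects (assumes numpy is installed; word ids may be plain integers or numpy arrays of integers, as the new docstring notes):

import numpy as np
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments

segmented_topics = [
    [(1, 2), (np.array([1, 3]), 4)],   # segments of topic 0
    [(2, 5)],                          # segments of topic 1
]
print(sorted(int(i) for i in unique_ids_from_segments(segmented_topics)))  # [1, 2, 3, 4, 5]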
