Refactor API reference gensim.topic_coherence. Fix #1669 #1714

Merged
merged 42 commits on Jan 10, 2018
Commits (42) — showing changes from 1 commit
29a8a37
Refactored aggregation
CLearERR Nov 13, 2017
56eda23
Micro-Fix for aggregation.py, partially refactored direct_confirmatio…
CLearERR Nov 14, 2017
edd53d4
Partially refactored indirect_confirmation_measure
CLearERR Nov 15, 2017
cfd6050
Some additions
CLearERR Nov 16, 2017
390b01e
Math attempts
CLearERR Nov 19, 2017
8b1a5ca
add math extension for sphinx
menshikh-iv Nov 20, 2017
8d2c584
Minor refactoring
CLearERR Nov 21, 2017
6eb8335
Some refactoring for probability_estimation
CLearERR Nov 22, 2017
7a47f05
Beta-strings
CLearERR Nov 23, 2017
667cad2
Different additions
CLearERR Nov 25, 2017
d41c5a3
Minor changes
CLearERR Nov 26, 2017
180c1c1
text_analysis left
CLearERR Nov 27, 2017
e3c1e29
Added example for ContextVectorComputer class
CLearERR Nov 28, 2017
da9ca29
probability_estimation 0.9
CLearERR Nov 29, 2017
f54fb0c
beta_version
CLearERR Nov 30, 2017
47ee63e
Added some examples for text_analysis
CLearERR Dec 3, 2017
65211f0
text_analysis: corrected example for class UsesDictionary
CLearERR Dec 4, 2017
c484962
Final additions for text_analysis.py
CLearERR Dec 7, 2017
71bb2bf
Merge branch 'develop' into fix-1669
menshikh-iv Dec 11, 2017
d9237ea
fix cross-reference problem
menshikh-iv Dec 11, 2017
275edd0
fix pep8
menshikh-iv Dec 11, 2017
94bde33
fix aggregation
menshikh-iv Dec 11, 2017
782d5cf
fix direct_confirmation_measure
menshikh-iv Dec 11, 2017
81732ef
fix types in direct_confirmation_measure
menshikh-iv Dec 11, 2017
3c7b401
partial fix indirect_confirmation_measure
menshikh-iv Dec 11, 2017
206784d
HotFix for probability_estimation and segmentation
CLearERR Dec 12, 2017
406ab5c
Merge branch 'fix-1669' of https://github.com/CLearERR/gensim into fi…
CLearERR Dec 12, 2017
67962be
Refactoring for probability_estimation
CLearERR Dec 12, 2017
74c5c86
Changes for indirect_confirmation_measure
CLearERR Dec 14, 2017
ef058df
Fixed segmentation, partly fixed text_analysis
CLearERR Dec 18, 2017
0b06468
Add Notes for text_analysis
CLearERR Dec 18, 2017
e3779d4
fix di/ind
menshikh-iv Dec 19, 2017
482377b
fix doc examples in probability_estimation
menshikh-iv Dec 19, 2017
acdebb1
fix probability_estimation
menshikh-iv Dec 20, 2017
8a07dee
fix segmentation
menshikh-iv Dec 20, 2017
63c35c2
fix docstring in probability_estimation
menshikh-iv Dec 20, 2017
4b63f6c
partial fix test_analysis
menshikh-iv Dec 20, 2017
540021c
add latex stuff for docs build
menshikh-iv Dec 20, 2017
790e07d
merge upstream
menshikh-iv Jan 10, 2018
965587b
doc fix[1]
menshikh-iv Jan 10, 2018
f8f25cb
doc fix[2]
menshikh-iv Jan 10, 2018
f42ad8f
remove apt install from travis (now doc build in circle)
menshikh-iv Jan 10, 2018
Micro-Fix for aggregation.py, partially refactored direct_confirmation.py
CLearERR committed Nov 14, 2017

commit 56eda2314678d83b336812cfb5e37b30d0be7d52
2 changes: 1 addition & 1 deletion gensim/topic_coherence/aggregation.py
@@ -25,7 +25,7 @@ def arithmetic_mean(confirmed_measures):

Returns
-------
float
numpy.float
Arithmetic mean of all the values contained in confirmation measures.

Examples
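The one-line change to `arithmetic_mean` above tightens the documented return type: `np.mean` returns a NumPy scalar, not a built-in `float`. A minimal standalone sketch (plain NumPy, no gensim needed; the input values are made up) of what the docstring now describes:

```python
import numpy as np

# arithmetic_mean() in aggregation.py is essentially np.mean over the
# confirmation values; note the result is a NumPy scalar (np.float64).
confirmed_measures = [1.1, 2.2, 3.3, 4.4]
result = np.mean(confirmed_measures)

print(result)                    # 2.75
print(type(result).__name__)     # float64 -- a NumPy scalar, which subclasses Python float
```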
115 changes: 82 additions & 33 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -19,22 +19,43 @@

def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False):
"""
This function calculates the log-conditional-probability measure
Calculate the log-conditional-probability measure
which is used by coherence measures such as U_mass.
This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]

Args:
segmented_topics (list): Output from the segmentation module of the segmented
topics. Is a list of list of tuples.
accumulator: word occurrence accumulator from probability_estimation.
with_std (bool): True to also include standard deviation across topic segment
sets in addition to the mean coherence for each topic; default is False.
with_support (bool): True to also include support across topic segments. The
support is defined as the number of pairwise similarity comparisons were
used to compute the overall topic coherence.

Returns:
Parameters
----------
segmented_topics : list
Output from the segmentation module of the segmented topics: a list of lists of tuples.
accumulator : list
Word occurrence accumulator from probability_estimation.
with_std : bool
True to also include standard deviation across topic segment
sets in addition to the mean coherence for each topic; default is False.
with_support : bool
True to also include support across topic segments. The
support is defined as the number of pairwise similarity comparisons
used to compute the overall topic coherence.

Returns
-------
list
    Log conditional probability measure for each topic.

Examples
--------
>>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
>>> from collections import namedtuple
>>> id2token = {1: 'test', 2: 'doc'}
>>> token2id = {v: k for k, v in id2token.items()}
>>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
>>> segmentation = [[(1, 2)]]
>>> num_docs = 5
>>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
>>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
>>> accumulator._num_docs = num_docs
>>> direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0]
Answer should be ~ ln(1 / 2) = -0.693147181

"""
topic_coherences = []
num_docs = float(accumulator.num_docs)
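The doctest above builds an accumulator by hand; the arithmetic it exercises can be reproduced without gensim. Below is a minimal sketch (names and the epsilon value are assumptions, not gensim's exact code) of the m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] computation on the same counts:

```python
import math

EPSILON = 1e-12  # small constant guarding against log(0); exact value is an assumption

# Hypothetical inverted index matching the docstring example:
# token -> set of ids of documents containing it, out of 5 documents total.
inverted_index = {'test': {2, 3, 4}, 'doc': {3, 5}}
num_docs = 5.0

def log_conditional_probability(w_prime, w_star):
    """m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]."""
    joint = len(inverted_index[w_prime] & inverted_index[w_star]) / num_docs  # P(W', W*)
    marginal = len(inverted_index[w_star]) / num_docs                         # P(W*)
    return math.log((joint + EPSILON) / marginal)

# One co-occurring document out of the two containing 'doc':
print(log_conditional_probability('test', 'doc'))  # ~ ln(1/2) = -0.693147181
```

This mirrors the expected value noted in the doctest: (1/5) / (2/5) = 1/2, whose log is about -0.693.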
@@ -59,14 +80,20 @@ def aggregate_segment_sims(segment_sims, with_std, with_support):
"""Compute various statistics from the segment similarities generated via
set pairwise comparisons of top-N word lists for a single topic.

Args:
segment_sims (iterable): floating point similarity values to aggregate.
with_std (bool): Set to True to include standard deviation.
with_support (bool): Set to True to include number of elements in `segment_sims`
as a statistic in the results returned.
Parameters
----------
segment_sims : iterable
floating point similarity values to aggregate.
with_std : bool
Set to True to include standard deviation.
with_support : bool
Set to True to include number of elements in `segment_sims` as a statistic in the results returned.

Returns
-------
tuple
tuple with (mean[, std[, support]])

Returns:
tuple: with (mean[, std[, support]])
"""
mean = np.mean(segment_sims)
stats = [mean]
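A runnable sketch of the aggregation the docstring above describes — mean, optionally followed by standard deviation and support — standalone and mirroring the documented (mean[, std[, support]]) shape, not necessarily the library's exact implementation:

```python
import numpy as np

def aggregate_segment_sims(segment_sims, with_std=False, with_support=False):
    # Mean similarity, optionally followed by the standard deviation and
    # the support (number of aggregated values).
    sims = np.asarray(list(segment_sims), dtype=float)
    stats = [sims.mean()]
    if with_std:
        stats.append(sims.std())
    if with_support:
        stats.append(len(sims))
    # A bare mean comes back as a scalar; anything more comes back as a tuple.
    return stats[0] if len(stats) == 1 else tuple(stats)

print(aggregate_segment_sims([0.2, 0.4, 0.6]))                                  # scalar mean
print(aggregate_segment_sims([0.2, 0.4, 0.6], with_std=True, with_support=True))  # (mean, std, support)
```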
@@ -83,27 +110,49 @@ def log_ratio_measure(
"""
If normalize=False:
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
Calculate the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]

If normalize=True:
This function calculates the normalized-log-ratio-measure, popularly known as
Calculate the normalized-log-ratio-measure, popularly known as
NPMI which is used by coherence measures such as c_v.
This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]

Args:
segmented_topics (list): Output from the segmentation module of the segmented
topics. Is a list of list of tuples.
accumulator: word occurrence accumulator from probability_estimation.
with_std (bool): True to also include standard deviation across topic segment
sets in addition to the mean coherence for each topic; default is False.
with_support (bool): True to also include support across topic segments. The
support is defined as the number of pairwise similarity comparisons were
used to compute the overall topic coherence.

Returns:
list : of log ratio measure for each topic.
Parameters
----------
segmented_topics : list
Output from the segmentation module of the segmented topics: a list of lists of tuples.
accumulator: list
Contributor review comment: list of ?
word occurrence accumulator from probability_estimation.
with_std : bool
True to also include standard deviation across topic segment
sets in addition to the mean coherence for each topic; default is False.
with_support : bool
True to also include support across topic segments. The
support is defined as the number of pairwise similarity comparisons
used to compute the overall topic coherence.

Returns
-------
list
List of log ratio measures, one for each topic.

Examples
--------
>>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
>>> from collections import namedtuple
>>> id2token = {1: 'test', 2: 'doc'}
>>> token2id = {v: k for k, v in id2token.items()}
>>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
>>> segmentation = [[(1, 2)]]
>>> num_docs = 5
>>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
>>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
>>> accumulator._num_docs = num_docs
>>> direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0]
Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557

"""
topic_coherences = []
num_docs = float(accumulator.num_docs)
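For comparison with the doctest above, here is a standalone sketch (hypothetical names, assumed epsilon; not gensim's exact code) of the log-ratio measure on the same counts, in both its PMI and normalized (NPMI) variants:

```python
import math

EPSILON = 1e-12  # guard against log(0); exact value is an assumption

# Hypothetical inverted index from the docstring example: token -> doc ids, 5 docs total.
inverted_index = {'test': {2, 3, 4}, 'doc': {3, 5}}
num_docs = 5.0

def log_ratio_measure(w_prime, w_star, normalize=False):
    """PMI:  m_lr(S_i)  = log[(P(W', W*) + e) / (P(W') * P(W*))]
    NPMI: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]"""
    joint = len(inverted_index[w_prime] & inverted_index[w_star]) / num_docs
    p_prime = len(inverted_index[w_prime]) / num_docs
    p_star = len(inverted_index[w_star]) / num_docs
    pmi = math.log((joint + EPSILON) / (p_prime * p_star))
    if normalize:
        return pmi / -math.log(joint + EPSILON)
    return pmi

print(log_ratio_measure('test', 'doc'))                  # ~ ln[(1/5)/((3/5)*(2/5))] = -0.182321557
print(log_ratio_measure('test', 'doc', normalize=True))  # NPMI, bounded in [-1, 1]
```

Normalization divides the PMI by -log of the joint probability, which is what bounds NPMI to [-1, 1].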