piskvorky · menshikh-iv · Jan 10, 2018 · Nov 13, 2017 · Nov 14, 2017 · Nov 15, 2017
diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py
@@ -4,10 +4,8 @@
 # Copyright (C) 2013 Radim Rehurek <[email protected]>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-This module contains functions to perform aggregation on a list of values
-obtained from the confirmation measure.
-"""
+"""This module contains functions to perform aggregation on a list of values
+obtained from the confirmation measure."""
 
 import logging
 import numpy as np
@@ -17,13 +15,24 @@
 
 def arithmetic_mean(confirmed_measures):
     """
-    This functoin performs the arithmetic mean aggregation on the output obtained from
+    Perform the arithmetic mean aggregation on the output obtained from
     the confirmation measure module.
 
-    Args:
-        confirmed_measures : list of calculated confirmation measure on each set in the segmented topics.
+    Parameters
+    ----------
+    confirmed_measures : list
+        List of calculated confirmation measure on each set in the segmented topics.
+
+    Returns
+    -------
+    numpy.float64
+        Arithmetic mean of all the values contained in confirmation measures.
+
+    Examples
+    --------
+    >>> from gensim.topic_coherence.aggregation import arithmetic_mean
+    >>> arithmetic_mean([1.1, 2.2, 3.3, 4.4])
+    2.75
 
-    Returns:
-        mean : Arithmetic mean of all the values contained in confirmation measures.
     """
     return np.mean(confirmed_measures)
diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py
@@ -19,22 +19,43 @@
 
 def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False):
     """
-    This function calculates the log-conditional-probability measure
+    Calculate the log-conditional-probability measure
     which is used by coherence measures such as U_mass.
     This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]
 
-    Args:
-        segmented_topics (list): Output from the segmentation module of the segmented
-            topics. Is a list of list of tuples.
-        accumulator: word occurrence accumulator from probability_estimation.
-        with_std (bool): True to also include standard deviation across topic segment
-            sets in addition to the mean coherence for each topic; default is False.
-        with_support (bool): True to also include support across topic segments. The
-            support is defined as the number of pairwise similarity comparisons were
-            used to compute the overall topic coherence.
-
-    Returns:
+    Parameters
+    ----------
+    segmented_topics : list
+        Output from the segmentation module of the segmented topics. Is a list of list of tuples.
+    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
+        Word occurrence accumulator from probability_estimation.
+    with_std : bool
+        True to also include standard deviation across topic segment
+        sets in addition to the mean coherence for each topic; default is False.
+    with_support : bool
+        True to also include support across topic segments. The
+        support is defined as the number of pairwise similarity comparisons that were
+        used to compute the overall topic coherence.
+
+    Returns
+    -------
         list : of log conditional probability measure for each topic.
+
+    Examples
+    --------
+    >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
+    >>> from collections import namedtuple
+    >>> id2token = {1: 'test', 2: 'doc'}
+    >>> token2id = {v: k for k, v in id2token.items()}
+    >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
+    >>> segmentation = [[(1, 2)]]
+    >>> num_docs = 5
+    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
+    >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
+    >>> accumulator._num_docs = num_docs
+    >>> result = direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0]
+    >>> # result is approximately ln(1 / 2) = -0.693147181
+
     """
     topic_coherences = []
     num_docs = float(accumulator.num_docs)
@@ -59,14 +80,20 @@ def aggregate_segment_sims(segment_sims, with_std, with_support):
     """Compute various statistics from the segment similarities generated via
     set pairwise comparisons of top-N word lists for a single topic.
 
-    Args:
-        segment_sims (iterable): floating point similarity values to aggregate.
-        with_std (bool): Set to True to include standard deviation.
-        with_support (bool): Set to True to include number of elements in `segment_sims`
-            as a statistic in the results returned.
+    Parameters
+    ----------
+    segment_sims : iterable
+        Floating point similarity values to aggregate.
+    with_std : bool
+        Set to True to include standard deviation.
+    with_support : bool
+        Set to True to include number of elements in `segment_sims` as a statistic in the results returned.
+
+    Returns
+    -------
+    tuple
+        tuple with (mean[, std[, support]])
 
-    Returns:
-        tuple: with (mean[, std[, support]])
     """
     mean = np.mean(segment_sims)
     stats = [mean]
@@ -83,27 +110,49 @@ def log_ratio_measure(
     """
     If normalize=False:
         Popularly known as PMI.
-        This function calculates the log-ratio-measure which is used by
+        Calculate the log-ratio-measure which is used by
         coherence measures such as c_v.
         This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
 
     If normalize=True:
-        This function calculates the normalized-log-ratio-measure, popularly knowns as
+        Calculate the normalized-log-ratio-measure, popularly known as
         NPMI which is used by coherence measures such as c_v.
         This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
 
-    Args:
-        segmented_topics (list): Output from the segmentation module of the segmented
-            topics. Is a list of list of tuples.
-        accumulator: word occurrence accumulator from probability_estimation.
-        with_std (bool): True to also include standard deviation across topic segment
-            sets in addition to the mean coherence for each topic; default is False.
-        with_support (bool): True to also include support across topic segments. The
-            support is defined as the number of pairwise similarity comparisons were
-            used to compute the overall topic coherence.
-
-    Returns:
-        list : of log ratio measure for each topic.
+    Parameters
+    ----------
+    segmented_topics : list of (list of tuples)
+        Output from the segmentation module of the segmented topics.
+    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
+        Word occurrence accumulator from probability_estimation.
+    with_std : bool
+        True to also include standard deviation across topic segment
+        sets in addition to the mean coherence for each topic; default is False.
+    with_support : bool
+        True to also include support across topic segments. The
+        support is defined as the number of pairwise similarity comparisons that were
+        used to compute the overall topic coherence.
+
+    Returns
+    -------
+    list
+        List of log ratio measure for each topic.
+
+    Examples
+    --------
+    >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
+    >>> from collections import namedtuple
+    >>> id2token = {1: 'test', 2: 'doc'}
+    >>> token2id = {v: k for k, v in id2token.items()}
+    >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
+    >>> segmentation = [[(1, 2)]]
+    >>> num_docs = 5
+    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
+    >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
+    >>> accumulator._num_docs = num_docs
+    >>> result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0]
+    >>> # result is approximately ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557
+
     """
     topic_coherences = []
     num_docs = float(accumulator.num_docs)

diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py
@@ -43,18 +43,40 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp
     """For each topic segmentation, compute average cosine similarity using a
     WordVectorsAccumulator.
 
-    Args:
-        segmented_topics (list): Output from the segmentation module of the segmented
-            topics. Is a list of list of tuples.
-        accumulator: word occurrence accumulator from probability_estimation.
-        with_std (bool): True to also include standard deviation across topic segment
-            sets in addition to the mean coherence for each topic; default is False.
-        with_support (bool): True to also include support across topic segments. The
-            support is defined as the number of pairwise similarity comparisons were
-            used to compute the overall topic coherence.
+    Parameters
+    ----------
+    segmented_topics : list of (list of tuples)
+        Output from the segmentation module of the segmented topics.
+    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`
+        Word occurrence accumulator from probability_estimation.
+    with_std : bool
+        True to also include standard deviation across topic segment
+        sets in addition to the mean coherence for each topic; default is False.
+    with_support : bool
+        True to also include support across topic segments. The
+        support is defined as the number of pairwise similarity comparisons that were
+        used to compute the overall topic coherence.
+
+    Returns
+    -------
+    list
+        List of word2vec cosine similarities per topic.
+
+    Examples
+    --------
+    >>> from gensim.corpora.dictionary import Dictionary
+    >>> import numpy as np
+    >>> from gensim.topic_coherence import indirect_confirmation_measure
+    >>> from gensim.topic_coherence import text_analysis
+    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
+    >>> dictionary = Dictionary()
+    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
+    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
+    >>> accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
+    >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]
+    >>> print("%.3f %.3f" % (mean, std))
+    0.727 0.007
 
-    Returns:
-        list : of word2vec cosine similarities per topic.
     """
     topic_coherences = []
     total_oov = 0
@@ -88,8 +110,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp
 def cosine_similarity(
         segmented_topics, accumulator, topics, measure='nlr', gamma=1,
         with_std=False, with_support=False):
-    r"""
-    This function calculates the indirect cosine measure.
+    r"""Calculate the indirect cosine measure.
 
     Given context vectors u = V(W') and w = V(W*) for the
     word sets of a pair S_i = (W', W*) indirect cosine measure
@@ -123,6 +144,7 @@ def cosine_similarity(
 
     Returns:
         list: of indirect cosine similarity measure for each topic.
+
     """
     context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)