Commit

piskvorky#1342: Clarify documentation in the probability_estimation module.
Sweeney, Mack committed Jun 9, 2017
1 parent 75fcac8 commit 96d1349
Showing 1 changed file with 25 additions and 17 deletions.
gensim/topic_coherence/probability_estimation.py: 42 changes (25 additions, 17 deletions)
@@ -18,9 +18,9 @@


 def p_boolean_document(corpus, segmented_topics):
-    """
-    This function performs the boolean document probability estimation. Boolean document estimates the probability
-    of a single word as the number of documents in which the word occurs divided by the total number of documents.
+    """This function performs the boolean document probability estimation.
+    Boolean document estimates the probability of a single word as the number
+    of documents in which the word occurs divided by the total number of documents.
     Args:
     ----
@@ -29,19 +29,19 @@ def p_boolean_document(corpus, segmented_topics):
     Returns:
     -------
-    per_topic_postings : Boolean document posting list for each unique topic id.
-    num_docs : Total number of documents in corpus.
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+                  frequencies and co-occurrence frequencies.
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     return CorpusAccumulator(top_ids).accumulate(corpus)
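
A quick usage sketch (not part of the commit): it assumes the returned accumulator exposes num_docs, get_occurrences and get_co_occurrences, as the updated docstring describes, and it builds a toy bag-of-words corpus with gensim's Dictionary.

from gensim.corpora import Dictionary
from gensim.topic_coherence import probability_estimation

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system'],
         ['eps', 'user', 'interface', 'system']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# One topic, already segmented into (W', W*) pairs of word ids.
ids = [dictionary.token2id[w] for w in ('human', 'computer', 'system')]
segmented_topics = [[(ids[0], ids[1]), (ids[1], ids[2])]]

accumulator = probability_estimation.p_boolean_document(corpus, segmented_topics)
print(accumulator.num_docs)                              # total documents in the corpus
print(accumulator.get_occurrences(ids[1]))               # documents containing 'computer'
print(accumulator.get_co_occurrences(ids[0], ids[1]))    # documents containing both words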


 def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
-    """
-    This function performs the boolean sliding window probability estimation. Boolean sliding window
-    determines word counts using a sliding window. The window moves over the documents one word token per step.
-    Each step defines a new virtual document by copying the window content. Boolean document is applied to
-    these virtual documents to compute word probabilities.
+    """This function performs the boolean sliding window probability estimation.
+    Boolean sliding window determines word counts using a sliding window. The window
+    moves over the documents one word token per step. Each step defines a new virtual
+    document by copying the window content. Boolean document is applied to these virtual
+    documents to compute word probabilities.
     Args:
     ----
@@ -52,8 +52,8 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1):
     Returns:
     -------
-    per_topic_postings : Boolean sliding window postings list of all the unique topic ids.
-    window_id[0] : Total no of windows
+    accumulator : word occurrence accumulator instance that can be used to lookup token
+                  frequencies and co-occurrence frequencies.
     """
     top_ids = unique_ids_from_segments(segmented_topics)
     if processes <= 1:
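
The updated docstring above describes boolean sliding window in terms of "virtual documents". The following standalone sketch (not the gensim implementation) illustrates that idea: each window position over a token list is copied out as one boolean virtual document, and boolean document estimation is then applied to those windows.

def virtual_documents(tokens, window_size):
    """Yield each window position's content as a set of tokens (one virtual document per step)."""
    if len(tokens) <= window_size:
        yield set(tokens)
        return
    for start in range(len(tokens) - window_size + 1):
        yield set(tokens[start:start + window_size])

doc = ['the', 'quick', 'brown', 'fox', 'jumps']
for window in virtual_documents(doc, window_size=3):
    print(window)
# Word (co-)occurrence counts are then taken over these windows instead of over
# whole documents, exactly as in the boolean document estimator.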
@@ -65,13 +65,21 @@


 def unique_ids_from_segments(segmented_topics):
-    """Return the set of all unique ids in a list of segmented topics."""
-    top_ids = set()  # is a set of all the unique ids contained in topics.
+    """Return the set of all unique ids in a list of segmented topics.
+    Args:
+    ----
+    segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set
+        is either a single integer, or a `numpy.ndarray` of integers.
+    Returns:
+    unique_ids : set of unique ids across all topic segments.
+    """
+    unique_ids = set()  # is a set of all the unique ids contained in topics.
     for s_i in segmented_topics:
         for word_id in itertools.chain.from_iterable(s_i):
             if hasattr(word_id, '__iter__'):
-                top_ids.update(word_id)
+                unique_ids.update(word_id)
             else:
-                top_ids.add(word_id)
+                unique_ids.add(word_id)
 
-    return top_ids
+    return unique_ids
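
For reference, a small sketch of what unique_ids_from_segments collects (assumes numpy is installed; word ids may be plain integers or numpy arrays of integers, as the new docstring notes):

import numpy as np
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments

segmented_topics = [
    [(1, 2), (np.array([1, 3]), 4)],   # segments of topic 0
    [(2, 5)],                          # segments of topic 1
]
print(sorted(int(i) for i in unique_ids_from_segments(segmented_topics)))  # [1, 2, 3, 4, 5]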
