[SPARK-7242] added python api for freqItems in DataFrames
The Python API for `DataFrame.freqItems`, plus addresses rxin's comments from the previous PR.

Author: Burak Yavuz <[email protected]>

Closes apache#5859 from brkyvz/df-freq-py2 and squashes the following commits:

f9aa9ce [Burak Yavuz] addressed comments v0.1
4b25056 [Burak Yavuz] added python api for freqItems
brkyvz authored and rxin committed May 2, 2015
1 parent b79aeb9 commit 2e0f357
Showing 3 changed files with 38 additions and 3 deletions.
25 changes: 25 additions & 0 deletions python/pyspark/sql/dataframe.py
@@ -889,6 +889,26 @@ def cov(self, col1, col2):
            raise ValueError("col2 should be a string.")
        return self._jdf.stat().cov(col1, col2)

    def freqItems(self, cols, support=None):
        """
        Finds frequent items for columns, possibly with false positives, using the
        frequent element count algorithm described in
        "http://dx.doi.org/10.1145/762471.762473", proposed by Karp, Schenker, and
        Papadimitriou.
        :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

        :param cols: Names of the columns to calculate frequent items for as a list or tuple of
            strings.
        :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
            The support must be greater than 1e-4.
        """
        if isinstance(cols, tuple):
            cols = list(cols)
        if not isinstance(cols, list):
            raise ValueError("cols must be a list or tuple of column names as strings.")
        if support is None:
            support = 0.01  # default support of 1%; `is None` keeps an explicit 0 from being masked
        return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)

    @ignore_unicode_prefix
    def withColumn(self, colName, col):
        """Returns a new :class:`DataFrame` by adding a column.
@@ -1344,6 +1364,11 @@ def cov(self, col1, col2):

    cov.__doc__ = DataFrame.cov.__doc__

    def freqItems(self, cols, support=None):
        return self.df.freqItems(cols, support)

    freqItems.__doc__ = DataFrame.freqItems.__doc__


def _test():
    import doctest
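For orientation, a minimal usage sketch of the new API (not part of the commit): it assumes a PySpark 1.x shell with a `sqlContext` in scope, and the column names are made up for illustration.

    from pyspark.sql import Row

    rows = [Row(payment="cash", fare=5.0), Row(payment="card", fare=5.0),
            Row(payment="cash", fare=12.5)]
    df = sqlContext.createDataFrame(rows)

    # `df.freqItems` and `df.stat.freqItems` are aliases; `support` defaults to 0.01.
    freq = df.stat.freqItems(["payment", "fare"], support=0.5)
    print(freq.collect())  # a single Row holding one array of frequent items per column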
7 changes: 7 additions & 0 deletions python/pyspark/sql/tests.py
@@ -375,6 +375,13 @@ def test_column_select(self):
        self.assertEqual(self.testData, df.select(df.key, df.value).collect())
        self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect())

    def test_freqItems(self):
        # Even indices get a=1 and b=-2.0, so each of those values occurs in
        # half the rows and must clear the 0.4 support threshold.
        vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)]
        df = self.sc.parallelize(vals).toDF()
        items = df.stat.freqItems(("a", "b"), 0.4).collect()[0]
        self.assertTrue(1 in items[0])
        self.assertTrue(-2.0 in items[1])

    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
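A quick check of the frequencies this test leans on, in plain Python (an aside, not part of the commit):

    # Even indices (50 of the 100 rows) carry a=1 and b=-2.0, i.e. frequency
    # 0.5 >= the 0.4 support, so both must be reported. Each odd value occurs
    # once (frequency 0.01), so it can only show up as a false positive and
    # the test deliberately asserts nothing about those.
    a_vals = [1 if i % 2 == 0 else i for i in range(100)]
    assert a_vals.count(1) / float(len(a_vals)) == 0.5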
6 changes: 6 additions & 3 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -43,7 +43,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
  }

  /**
   * Finds frequent items for columns, possibly with false positives, using the
   * frequent element count algorithm described in
   * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
   * Uses a default `support` of 1%.
   *
   * @param cols the names of the columns to search frequent items in.
   * @return a local DataFrame with an Array of frequent items for each column.
@@ -55,14 +58,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
  /**
   * Python-friendly implementation for `freqItems`.
   */
  def freqItems(cols: Seq[String], support: Double): DataFrame = {
    FrequentItems.singlePassFreqItems(df, cols, support)
  }

  /**
   * Python-friendly implementation for `freqItems` with a default `support` of 1%.
   */
  def freqItems(cols: Seq[String]): DataFrame = {
    FrequentItems.singlePassFreqItems(df, cols, 0.01)
  }

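The cited paper's frequent element count algorithm (essentially Misra-Gries) is what `FrequentItems.singlePassFreqItems` implements on the Scala side; that implementation is not shown in this diff, so here is only an illustrative single-column sketch in Python:

    def freq_items(stream, support):
        """One-pass sketch: any item with true frequency > `support` is
        guaranteed to survive in `counters`; extra survivors are the
        possible false positives the docs warn about."""
        max_counters = int(1.0 / support)
        counters = {}
        for item in stream:
            if item in counters:
                counters[item] += 1
            elif len(counters) < max_counters:
                counters[item] = 1
            else:
                # No free slot: decrement every counter, evicting zeros.
                for key in list(counters):
                    counters[key] -= 1
                    if counters[key] == 0:
                        del counters[key]
        return list(counters)

    print(freq_items([1, 2, 1, 3, 1, 4, 1, 5], support=0.4))  # [1]: freq 0.5 > 0.4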
