Skip to content

Commit

Permalink
[SPARK-46513][PS][TESTS] Move BasicIndexingTests to `pyspark.pandas.tests.indexes.*`
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?
Move `BasicIndexingTests` to `pyspark.pandas.tests.indexes.*`

### Why are the changes needed?
Test code cleanup.

### Does this PR introduce _any_ user-facing change?
no, test-only

### How was this patch tested?
CI

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #44499 from zhengruifeng/ps_test_index_basic.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
zhengruifeng authored and HyukjinKwon committed Dec 26, 2023
1 parent fcf5d57 commit fb09e31
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 146 deletions.
3 changes: 2 additions & 1 deletion dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,7 @@ def __hash__(self):
"pyspark.pandas.tests.indexes.test_datetime_round",
"pyspark.pandas.tests.indexes.test_align",
"pyspark.pandas.tests.indexes.test_indexing",
"pyspark.pandas.tests.indexes.test_indexing_basic",
"pyspark.pandas.tests.indexes.test_reindex",
"pyspark.pandas.tests.indexes.test_rename",
"pyspark.pandas.tests.indexes.test_reset_index",
Expand Down Expand Up @@ -1088,6 +1089,7 @@ def __hash__(self):
"pyspark.pandas.tests.connect.indexes.test_parity_map",
"pyspark.pandas.tests.connect.indexes.test_parity_align",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic",
"pyspark.pandas.tests.connect.indexes.test_parity_reindex",
"pyspark.pandas.tests.connect.indexes.test_parity_rename",
"pyspark.pandas.tests.connect.indexes.test_parity_reset_index",
Expand Down Expand Up @@ -1136,7 +1138,6 @@ def __hash__(self):
"pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
"pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
"pyspark.pandas.tests.connect.test_parity_indexing",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,21 @@
#
import unittest

import pandas as pd

from pyspark import pandas as ps
from pyspark.pandas.tests.test_indexing import BasicIndexingTestsMixin
from pyspark.pandas.tests.indexes.test_indexing_basic import BasicIndexingTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class BasicIndexingParityTests(
BasicIndexingTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
BasicIndexingTestsMixin,
PandasOnSparkTestUtils,
ReusedConnectTestCase,
):
@property
def pdf(self):
return pd.DataFrame(
{"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]}
)

@property
def psdf(self):
return ps.from_pandas(self.pdf)
pass


if __name__ == "__main__":
from pyspark.pandas.tests.connect.test_parity_indexing import * # noqa: F401
from pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic import * # noqa: F401

try:
import xmlrunner # type: ignore[import]
Expand Down
171 changes: 171 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_indexing_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

import pandas as pd

from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase, compare_both


class BasicIndexingTestsMixin:
    """Shared ``set_index`` / ``reset_index`` test cases.

    The test methods call ``self.assertRaises``, ``self.assertRaisesRegex``
    and ``self.assertPandasEqual``, so this mixin has to be combined with a
    test-case base class that provides those helpers.
    """

    @property
    def pdf(self):
        """Build the pandas fixture: four rows with month, year and sale columns.

        A new DataFrame is constructed on every access.
        """
        return pd.DataFrame(
            {"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]}
        )

    @property
    def psdf(self):
        """Return the pandas-on-Spark counterpart of ``pdf`` via ``ps.from_pandas``."""
        return ps.from_pandas(self.pdf)

    # NOTE(review): ``compare_both`` is defined in pyspark.testing.pandasutils and
    # is not visible here. Presumably it drives this generator once with the
    # pandas frame and once with the pandas-on-Spark frame and compares the
    # yielded results pairwise -- confirm. The order of the ``yield`` statements
    # therefore matters and must stay in sync between both runs.
    @compare_both(almost=False)
    def test_indexing(self, df):
        # Single-column index; df1 is reused by many of the cases below.
        df1 = df.set_index("month")
        yield df1

        # set_index keyword variants: drop, append, and multiple keys.
        yield df.set_index("month", drop=False)
        yield df.set_index("month", append=True)
        yield df.set_index(["year", "month"])
        yield df.set_index(["year", "month"], drop=False)
        yield df.set_index(["year", "month"], append=True)

        yield df1.set_index("year", drop=False, append=True)

        # In-place variant, applied to a copy so df1 keeps its single-level index.
        # df2 ends up with the two index levels (month, year).
        df2 = df1.copy()
        df2.set_index("year", append=True, inplace=True)
        yield df2

        # Unknown column labels must raise KeyError mentioning the label.
        self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown"))
        self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"]))

        # reset_index on the original frame, the single-level frame and the
        # two-level frame, with and without dropping the index.
        for d in [df, df1, df2]:
            yield d.reset_index()
            yield d.reset_index(drop=True)

        # Select the level(s) to reset by position and by name, singly and as lists.
        yield df1.reset_index(level=0)
        yield df2.reset_index(level=1)
        yield df2.reset_index(level=[1, 0])
        yield df1.reset_index(level="month")
        yield df2.reset_index(level="year")
        yield df2.reset_index(level=["month", "year"])
        yield df2.reset_index(level="month", drop=True)
        yield df2.reset_index(level=["month", "year"], drop=True)

        # Out-of-range positions on the single-level index raise IndexError.
        self.assertRaisesRegex(
            IndexError,
            "Too many levels: Index has only 1 level, not 3",
            lambda: df1.reset_index(level=2),
        )
        self.assertRaisesRegex(
            IndexError,
            "Too many levels: Index has only 1 level, not 4",
            lambda: df1.reset_index(level=[3, 2]),
        )
        # Unknown level names raise KeyError; the expected message differs between
        # the single-level frame (df1) and the two-level frame (df2).
        self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown"))
        self.assertRaisesRegex(
            KeyError, "Level unknown not found", lambda: df2.reset_index(level="unknown")
        )

        # In-place reset_index, again on a copy so df2 stays usable below.
        df3 = df2.copy()
        df3.reset_index(inplace=True)
        yield df3

        # Series.reset_index variants, including the ``name`` argument.
        yield df1.sale.reset_index()
        yield df1.sale.reset_index(level=0)
        yield df2.sale.reset_index(level=[1, 0])
        yield df1.sale.reset_index(drop=True)
        yield df1.sale.reset_index(name="s")
        yield df1.sale.reset_index(name="s", drop=True)

        s = df1.sale
        # In-place reset without drop=True would have to turn the Series into a
        # DataFrame, which is rejected with TypeError.
        self.assertRaisesRegex(
            TypeError,
            "Cannot reset_index inplace on a Series to create a DataFrame",
            lambda: s.reset_index(inplace=True),
        )
        s.reset_index(drop=True, inplace=True)
        # Yield both the Series and df1 after the in-place reset.
        # NOTE(review): presumably this checks that the frame the Series was
        # taken from stays consistent between pandas and pandas-on-Spark -- confirm.
        yield s
        yield df1

        # multi-index columns
        df4 = df.copy()
        df4.columns = pd.MultiIndex.from_tuples(
            [("cal", "month"), ("cal", "year"), ("num", "sale")]
        )
        df5 = df4.set_index(("cal", "month"))
        yield df5
        yield df4.set_index([("cal", "month"), ("num", "sale")])

        # A bare tuple as ``level`` is expected to raise KeyError, while the same
        # tuple wrapped in a list (next statement) is accepted.
        self.assertRaises(KeyError, lambda: df5.reset_index(level=("cal", "month")))

        yield df5.reset_index(level=[("cal", "month")])

        # non-string names
        df6 = df.copy()
        df6.columns = [10.0, 20.0, 30.0]
        df7 = df6.set_index(10.0)
        yield df7
        yield df6.set_index([10.0, 30.0])

        yield df7.reset_index(level=10.0)
        yield df7.reset_index(level=[10.0])

        # Multi-index columns whose first level is an integer rather than a string.
        df8 = df.copy()
        df8.columns = pd.MultiIndex.from_tuples([(10, "month"), (10, "year"), (20, "sale")])
        df9 = df8.set_index((10, "month"))
        yield df9
        yield df8.set_index([(10, "month"), (20, "sale")])

        yield df9.reset_index(level=[(10, "month")])

    def test_from_pandas_with_explicit_index(self):
        # Converting a pandas frame that already carries a single- or multi-level
        # index and converting it back must reproduce the same pandas frame.
        pdf = self.pdf

        df1 = ps.from_pandas(pdf.set_index("month"))
        self.assertPandasEqual(df1._to_pandas(), pdf.set_index("month"))

        df2 = ps.from_pandas(pdf.set_index(["year", "month"]))
        self.assertPandasEqual(df2._to_pandas(), pdf.set_index(["year", "month"]))

    def test_limitations(self):
        # Mixing positional (int) and named (str) levels in one reset_index call
        # is expected to be rejected by pandas-on-Spark with ValueError.
        df = self.psdf.set_index("month")

        self.assertRaisesRegex(
            ValueError,
            "Level should be all int or all string.",
            lambda: df.reset_index([1, "month"]),
        )


class BasicIndexingTests(BasicIndexingTestsMixin, PandasOnSparkTestCase):
    """Concrete test case: the shared mixin tests on top of ``PandasOnSparkTestCase``."""


if __name__ == "__main__":
    # Script entry point: run this module's tests directly.
    # NOTE(review): the star import pulls the module's names in again under the
    # package-qualified path; presumably so the tests are reported under that
    # module name -- confirm against the other pyspark test modules.
    from pyspark.pandas.tests.indexes.test_indexing_basic import *  # noqa: F401

    try:
        # xmlrunner is optional; when it is installed, results are written as
        # XML reports under target/test-reports.
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # With testRunner=None, unittest.main falls back to its default runner.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
130 changes: 0 additions & 130 deletions python/pyspark/pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,132 +26,6 @@
from pyspark.testing.pandasutils import ComparisonTestBase, compare_both


class BasicIndexingTestsMixin:
    """Shared ``set_index`` / ``reset_index`` test cases.

    The test methods call ``self.assertRaises``, ``self.assertRaisesRegex``
    and ``self.assertPandasEqual`` and read ``self.psdf``, none of which are
    defined here, so this mixin has to be combined with a base class that
    provides them.
    """

    @property
    def pdf(self):
        """Build the pandas fixture: four rows with month, year and sale columns.

        A new DataFrame is constructed on every access.
        """
        return pd.DataFrame(
            {"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]}
        )

    # NOTE(review): ``compare_both`` is defined in pyspark.testing.pandasutils and
    # is not visible here. Presumably it drives this generator once with the
    # pandas frame and once with the pandas-on-Spark frame and compares the
    # yielded results pairwise -- confirm. The order of the ``yield`` statements
    # therefore matters and must stay in sync between both runs.
    @compare_both(almost=False)
    def test_indexing(self, df):
        # Single-column index; df1 is reused by many of the cases below.
        df1 = df.set_index("month")
        yield df1

        # set_index keyword variants: drop, append, and multiple keys.
        yield df.set_index("month", drop=False)
        yield df.set_index("month", append=True)
        yield df.set_index(["year", "month"])
        yield df.set_index(["year", "month"], drop=False)
        yield df.set_index(["year", "month"], append=True)

        yield df1.set_index("year", drop=False, append=True)

        # In-place variant, applied to a copy so df1 keeps its single-level index.
        # df2 ends up with the two index levels (month, year).
        df2 = df1.copy()
        df2.set_index("year", append=True, inplace=True)
        yield df2

        # Unknown column labels must raise KeyError mentioning the label.
        self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown"))
        self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"]))

        # reset_index on the original frame, the single-level frame and the
        # two-level frame, with and without dropping the index.
        for d in [df, df1, df2]:
            yield d.reset_index()
            yield d.reset_index(drop=True)

        # Select the level(s) to reset by position and by name, singly and as lists.
        yield df1.reset_index(level=0)
        yield df2.reset_index(level=1)
        yield df2.reset_index(level=[1, 0])
        yield df1.reset_index(level="month")
        yield df2.reset_index(level="year")
        yield df2.reset_index(level=["month", "year"])
        yield df2.reset_index(level="month", drop=True)
        yield df2.reset_index(level=["month", "year"], drop=True)

        # Out-of-range positions on the single-level index raise IndexError.
        self.assertRaisesRegex(
            IndexError,
            "Too many levels: Index has only 1 level, not 3",
            lambda: df1.reset_index(level=2),
        )
        self.assertRaisesRegex(
            IndexError,
            "Too many levels: Index has only 1 level, not 4",
            lambda: df1.reset_index(level=[3, 2]),
        )
        # Unknown level names raise KeyError; the expected message differs between
        # the single-level frame (df1) and the two-level frame (df2).
        self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown"))
        self.assertRaisesRegex(
            KeyError, "Level unknown not found", lambda: df2.reset_index(level="unknown")
        )

        # In-place reset_index, again on a copy so df2 stays usable below.
        df3 = df2.copy()
        df3.reset_index(inplace=True)
        yield df3

        # Series.reset_index variants, including the ``name`` argument.
        yield df1.sale.reset_index()
        yield df1.sale.reset_index(level=0)
        yield df2.sale.reset_index(level=[1, 0])
        yield df1.sale.reset_index(drop=True)
        yield df1.sale.reset_index(name="s")
        yield df1.sale.reset_index(name="s", drop=True)

        s = df1.sale
        # In-place reset without drop=True would have to turn the Series into a
        # DataFrame, which is rejected with TypeError.
        self.assertRaisesRegex(
            TypeError,
            "Cannot reset_index inplace on a Series to create a DataFrame",
            lambda: s.reset_index(inplace=True),
        )
        s.reset_index(drop=True, inplace=True)
        # Yield both the Series and df1 after the in-place reset.
        # NOTE(review): presumably this checks that the frame the Series was
        # taken from stays consistent between pandas and pandas-on-Spark -- confirm.
        yield s
        yield df1

        # multi-index columns
        df4 = df.copy()
        df4.columns = pd.MultiIndex.from_tuples(
            [("cal", "month"), ("cal", "year"), ("num", "sale")]
        )
        df5 = df4.set_index(("cal", "month"))
        yield df5
        yield df4.set_index([("cal", "month"), ("num", "sale")])

        # A bare tuple as ``level`` is expected to raise KeyError, while the same
        # tuple wrapped in a list (next statement) is accepted.
        self.assertRaises(KeyError, lambda: df5.reset_index(level=("cal", "month")))

        yield df5.reset_index(level=[("cal", "month")])

        # non-string names
        df6 = df.copy()
        df6.columns = [10.0, 20.0, 30.0]
        df7 = df6.set_index(10.0)
        yield df7
        yield df6.set_index([10.0, 30.0])

        yield df7.reset_index(level=10.0)
        yield df7.reset_index(level=[10.0])

        # Multi-index columns whose first level is an integer rather than a string.
        df8 = df.copy()
        df8.columns = pd.MultiIndex.from_tuples([(10, "month"), (10, "year"), (20, "sale")])
        df9 = df8.set_index((10, "month"))
        yield df9
        yield df8.set_index([(10, "month"), (20, "sale")])

        yield df9.reset_index(level=[(10, "month")])

    def test_from_pandas_with_explicit_index(self):
        # Converting a pandas frame that already carries a single- or multi-level
        # index and converting it back must reproduce the same pandas frame.
        pdf = self.pdf

        df1 = ps.from_pandas(pdf.set_index("month"))
        self.assertPandasEqual(df1._to_pandas(), pdf.set_index("month"))

        df2 = ps.from_pandas(pdf.set_index(["year", "month"]))
        self.assertPandasEqual(df2._to_pandas(), pdf.set_index(["year", "month"]))

    def test_limitations(self):
        # Mixing positional (int) and named (str) levels in one reset_index call
        # is expected to be rejected by pandas-on-Spark with ValueError.
        # ``self.psdf`` is not defined in this mixin; the host class supplies it.
        df = self.psdf.set_index("month")

        self.assertRaisesRegex(
            ValueError,
            "Level should be all int or all string.",
            lambda: df.reset_index([1, "month"]),
        )


class IndexingTest(ComparisonTestBase):
@property
def pdf(self):
Expand Down Expand Up @@ -1320,10 +1194,6 @@ def test_index_operator_int(self):
psdf.iloc[[1, 1]]


class BasicIndexingTests(
    BasicIndexingTestsMixin,
    ComparisonTestBase,
):
    """Concrete test case: the shared mixin tests on top of ``ComparisonTestBase``."""


if __name__ == "__main__":
from pyspark.pandas.tests.test_indexing import * # noqa: F401

Expand Down

0 comments on commit fb09e31

Please sign in to comment.