forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize `IndexingTest`: factor out `test_iloc*`

### What changes were proposed in this pull request?
Factor out `test_iloc*` and add the missing parity test.

### Why are the changes needed?
For test parity and parallelism.

### Does this PR introduce _any_ user-facing change?
No, test-only.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#44510 from zhengruifeng/ps_test_indexing_iloc.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
- Loading branch information
1 parent
3f08c9f
commit 7cad075
Showing
4 changed files
with
346 additions
and
217 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_iloc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
from pyspark.pandas.tests.indexes.test_indexing_iloc import IndexingILocMixin | ||
from pyspark.testing.connectutils import ReusedConnectTestCase | ||
from pyspark.testing.pandasutils import PandasOnSparkTestUtils | ||
|
||
|
||
class IndexingILocParityTests(IndexingILocMixin, PandasOnSparkTestUtils, ReusedConnectTestCase):
    """Bind the ``IndexingILocMixin`` test methods to ``ReusedConnectTestCase``.

    The class adds nothing of its own; every test comes from the mixin.
    """
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc import *  # noqa

    # Start with no explicit runner (unittest then picks its default) and
    # upgrade to the XML-report runner only when xmlrunner can be imported.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=runner, verbosity=2)
303 changes: 303 additions & 0 deletions
303
python/pyspark/pandas/tests/indexes/test_indexing_iloc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,303 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from pyspark import pandas as ps | ||
from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError | ||
from pyspark.testing.pandasutils import PandasOnSparkTestCase | ||
from pyspark.testing.sqlutils import SQLTestUtils | ||
|
||
|
||
class IndexingILocMixin:
    """Positional-indexing (``iloc``) checks comparing pandas-on-Spark with pandas.

    This is a mixin: it calls ``self.assert_eq``, ``self.subTest`` and the
    ``assertRaises*`` family, so it must be combined with test-case bases
    that provide them.
    """

    @property
    def pdf(self):
        """Build a new pandas frame with string column labels and a duplicated index."""
        columns = {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}
        row_labels = [0, 1, 3, 5, 6, 8, 9, 9, 9]
        return pd.DataFrame(columns, index=row_labels)

    @property
    def pdf2(self):
        """Build the same data as ``pdf`` but with the integer column labels 0 and 1."""
        columns = {0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]}
        row_labels = [0, 1, 3, 5, 6, 8, 9, 9, 9]
        return pd.DataFrame(columns, index=row_labels)

    @property
    def psdf(self):
        """Convert ``pdf`` to a pandas-on-Spark frame."""
        pandas_frame = self.pdf
        return ps.from_pandas(pandas_frame)

    @property
    def psdf2(self):
        """Convert ``pdf2`` to a pandas-on-Spark frame."""
        pandas_frame = self.pdf2
        return ps.from_pandas(pandas_frame)

    def test_iloc(self):
        # Scalar access plus every column selector crossed with three row slices.
        expected = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
        actual = ps.from_pandas(expected)

        self.assert_eq(actual.iloc[0, 0], expected.iloc[0, 0])

        # NOTE: boolean-mask row selection is not exercised; the assertion
        #   self.assert_eq(psdf.iloc[psdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer])
        # is left disabled.
        column_selectors = [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]
        row_slices = (slice(None), slice(None, 1), slice(None, -1))  # ":", ":1", ":-1"
        for cols in column_selectors:
            for rows in row_slices:
                self.assert_eq(actual.iloc[rows, cols], expected.iloc[rows, cols])

        with self.assertRaisesRegex(
            SparkPandasNotImplementedError, ".iloc requires numeric slice, conditional boolean"
        ):
            ps.range(10).iloc["a", :]

    def test_iloc_multiindex_columns(self):
        # Same column-selector sweep as test_iloc, on two-level column labels.
        level_0 = np.array(["bar", "bar", "baz", "baz"])
        level_1 = np.array(["one", "two", "one", "two"])

        expected = pd.DataFrame(
            np.random.randn(3, 4), index=["A", "B", "C"], columns=[level_0, level_1]
        )
        actual = ps.from_pandas(expected)

        # NOTE: boolean-mask row selection is not exercised; the assertion
        #   self.assert_eq(psdf.iloc[psdf.index == "B", indexer],
        #                  pdf.iloc[pdf.index == "B", indexer])
        # is left disabled.
        column_selectors = [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]
        row_slices = (slice(None), slice(None, 1), slice(None, -1))  # ":", ":1", ":-1"
        for cols in column_selectors:
            for rows in row_slices:
                self.assert_eq(actual.iloc[rows, cols], expected.iloc[rows, cols])

    def test_iloc_series(self):
        # Series.iloc on a plain series first, then on a derived (``+ 1``) series.
        expected = pd.Series([1, 2, 3])
        actual = ps.from_pandas(expected)

        keys = (0, slice(None), slice(None, 1), slice(None, -1))  # 0, ":", ":1", ":-1"
        for key in keys:
            self.assert_eq(actual.iloc[key], expected.iloc[key])

        for key in keys:
            self.assert_eq((actual + 1).iloc[key], (expected + 1).iloc[key])

    def test_iloc_slice_rows_sel(self):
        # Row slices (including negative bounds and steps) on a frame, a
        # column, and a derived column; results are sorted before comparing.
        expected = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
        actual = ps.from_pandas(expected)

        # Each view is applied lazily to the Spark side first, then the pandas side.
        views = (
            lambda frame: frame,
            lambda frame: frame.A,
            lambda frame: frame.A + 1,
        )
        row_selections = [
            slice(None),
            slice(0, 1),
            slice(1, 2),
            slice(-3, None),
            slice(None, -3),
            slice(None, 0),
            slice(None, None, 3),
            slice(3, 8, 2),
            slice(None, None, -2),
            slice(8, 3, -2),
            slice(8, None, -2),
            slice(None, 3, -2),
        ]
        for rows_sel in row_selections:
            with self.subTest(rows_sel=rows_sel):
                for view in views:
                    self.assert_eq(
                        view(actual).iloc[rows_sel].sort_index(),
                        view(expected).iloc[rows_sel].sort_index(),
                    )

    def test_iloc_iterable_rows_sel(self):
        # List / ndarray row selectors, alone and paired with a column slice.
        expected = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
        actual = ps.from_pandas(expected)

        # Each view is applied lazily to the Spark side first, then the pandas side.
        views = (
            lambda frame: frame,
            lambda frame: frame.A,
            lambda frame: frame.A + 1,
        )
        column_slices = (slice(None), slice(None, 1))  # ":", ":1"
        row_selections = [
            [],
            np.array([0, 1]),
            [1, 2],
            np.array([-3]),
            [3],
            np.array([-2]),
            [8, 3, -5],
        ]
        for rows_sel in row_selections:
            with self.subTest(rows_sel=rows_sel):
                for view in views:
                    self.assert_eq(
                        view(actual).iloc[rows_sel].sort_index(),
                        view(expected).iloc[rows_sel].sort_index(),
                    )

            # One sub-test per column slice so each is reported separately.
            for cols in column_slices:
                with self.subTest(rows_sel=rows_sel):
                    self.assert_eq(
                        actual.iloc[rows_sel, cols].sort_index(),
                        expected.iloc[rows_sel, cols].sort_index(),
                    )

    def test_frame_iloc_setitem(self):
        # Assignments through DataFrame.iloc; the same write is applied to both
        # frames, which must stay equal afterwards.
        expected = pd.DataFrame(
            [[1, 2], [4, 5], [7, 8]],
            index=["cobra", "viper", "sidewinder"],
            columns=["max_speed", "shield"],
        )
        actual = ps.from_pandas(expected)

        scalar_writes = [
            (([1, 2], [1, 0]), 10),
            ((0, 1), 50),
        ]
        for key, value in scalar_writes:
            expected.iloc[key] = value
            actual.iloc[key] = value
            self.assert_eq(actual, expected)

        # Right-hand sides that must be rejected.
        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
            actual.iloc[0, 0] = -actual.max_speed
        with self.assertRaisesRegex(ValueError, "shape mismatch"):
            actual.iloc[:, [1, 0]] = -actual.max_speed
        with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
            actual.iloc[:, 0] = actual

        # A single-column frame may be assigned to a single column.
        single_expected = pd.DataFrame(
            [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
        )
        single_actual = ps.from_pandas(single_expected)

        single_expected.iloc[:, 0] = single_expected
        single_actual.iloc[:, 0] = single_actual
        self.assert_eq(single_actual, single_expected)

    def test_series_iloc_setitem(self):
        # Assignments through Series.iloc, checking the series, its parent
        # frame, and an untouched sibling column after each write.
        expected = pd.DataFrame(
            {"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]
        )
        actual = ps.from_pandas(expected)

        expected_x = expected.x
        expected_y = expected.y
        actual_x = actual.x
        actual_y = actual.y

        # Indexer objects captured before the loop are reused for the negated writes.
        expected_iloc = expected_x.iloc
        actual_iloc = actual_x.iloc

        expected_plus1 = expected_x + 1
        actual_plus1 = actual_x + 1

        def check(actual_series, expected_series):
            # Series under test, then the parent frame, then the sibling column.
            self.assert_eq(actual_series, expected_series)
            self.assert_eq(actual, expected)
            self.assert_eq(actual_y, expected_y)

        writes = [
            ([1, 2], 10),
            (1, 50),
            (slice(None), 10),
            (slice(None, 1), 20),
            (slice(1, None), 30),
        ]
        for key, value in writes:
            with self.subTest(key=key, value=value):
                expected_x.iloc[key] = value
                actual_x.iloc[key] = value
                check(actual_x, expected_x)

                expected_iloc[key] = -value
                actual_iloc[key] = -value
                check(actual_x, expected_x)

                expected_plus1.iloc[key] = value
                actual_plus1.iloc[key] = value
                check(actual_plus1, expected_plus1)

        with self.assertRaises(ValueError):
            actual_x.iloc[1] = -actual_x

        # Series created from an Index rather than from a frame column.
        index_expected = pd.Index([1, 2, 3]).to_series()
        index_actual = ps.Index([1, 2, 3]).to_series()

        index_expected_plus1 = index_expected + 1
        index_actual_plus1 = index_actual + 1

        index_expected.iloc[0] = 10
        index_actual.iloc[0] = 10
        self.assert_eq(index_actual, index_expected)

        index_expected_plus1.iloc[0] = 20
        index_actual_plus1.iloc[0] = 20
        self.assert_eq(index_actual_plus1, index_expected_plus1)

        # Assigning another column of the same frame.
        ab_expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        ab_actual = ps.from_pandas(ab_expected)

        a_expected = ab_expected.a
        a_actual = ab_actual.a

        a_expected.iloc[[0, 1, 2]] = -ab_expected.b
        a_actual.iloc[[0, 1, 2]] = -ab_actual.b
        self.assert_eq(a_actual, a_expected)
        self.assert_eq(ab_actual, ab_expected)

        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
            a_actual.iloc[1] = ab_actual[["b"]]

    def test_iloc_raises(self):
        # Invalid indexers: each entry is (exception type, message pattern, lookup).
        pandas_frame = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
        actual = ps.from_pandas(pandas_frame)

        failing_lookups = [
            (
                SparkPandasIndexingError,
                "Only accepts pairs of candidates",
                lambda: actual.iloc[[0, 1], [0, 1], [1, 2]],
            ),
            (
                SparkPandasIndexingError,
                "Too many indexers",
                lambda: actual.A.iloc[[0, 1], [0, 1]],
            ),
            (
                TypeError,
                "cannot do slice indexing with these indexers",
                lambda: actual.iloc[:"b", :],
            ),
            (
                TypeError,
                "cannot do slice indexing with these indexers",
                lambda: actual.iloc[:, :"b"],
            ),
            (
                TypeError,
                "cannot perform reduce with flexible type",
                lambda: actual.iloc[:, ["A"]],
            ),
            (
                ValueError,
                "Location based indexing can only have",
                lambda: actual.iloc[:, "A"],
            ),
            (
                IndexError,
                "out of range",
                lambda: actual.iloc[:, [5, 6]],
            ),
        ]
        for error_type, message_pattern, lookup in failing_lookups:
            with self.assertRaisesRegex(error_type, message_pattern):
                lookup()
|
||
|
||
class IndexingILocTests(IndexingILocMixin, PandasOnSparkTestCase, SQLTestUtils):
    """Bind the ``IndexingILocMixin`` test methods to ``PandasOnSparkTestCase``.

    The class adds nothing of its own; every test comes from the mixin.
    """
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.indexes.test_indexing_iloc import *  # noqa: F401

    # Start with no explicit runner (unittest then picks its default) and
    # upgrade to the XML-report runner only when xmlrunner can be imported.
    runner = None
    try:
        import xmlrunner

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=runner, verbosity=2)
Oops, something went wrong.