Skip to content

Commit

Permalink
[SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize IndexingTest: factor…
Browse files Browse the repository at this point in the history
… out `test_iloc*`

### What changes were proposed in this pull request?
factor out `test_iloc*` and add the missing parity test

### Why are the changes needed?
for test parity and parallelism

### Does this PR introduce _any_ user-facing change?
no, test-only

### How was this patch tested?
ci

### Was this patch authored or co-authored using generative AI tooling?
no

Closes apache#44510 from zhengruifeng/ps_test_indexing_iloc.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
  • Loading branch information
zhengruifeng authored and yaooqinn committed Dec 27, 2023
1 parent 3f08c9f commit 7cad075
Show file tree
Hide file tree
Showing 4 changed files with 346 additions and 217 deletions.
2 changes: 2 additions & 0 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,7 @@ def __hash__(self):
"pyspark.pandas.tests.indexes.test_align",
"pyspark.pandas.tests.indexes.test_indexing",
"pyspark.pandas.tests.indexes.test_indexing_basic",
"pyspark.pandas.tests.indexes.test_indexing_iloc",
"pyspark.pandas.tests.indexes.test_indexing_loc",
"pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx",
"pyspark.pandas.tests.indexes.test_reindex",
Expand Down Expand Up @@ -1092,6 +1093,7 @@ def __hash__(self):
"pyspark.pandas.tests.connect.indexes.test_parity_align",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx",
"pyspark.pandas.tests.connect.indexes.test_parity_reindex",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.indexes.test_indexing_iloc import IndexingILocMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class IndexingILocParityTests(
IndexingILocMixin,
PandasOnSparkTestUtils,
ReusedConnectTestCase,
):
pass


if __name__ == "__main__":
from pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc import * # noqa

try:
import xmlrunner # type: ignore[import]

testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)
303 changes: 303 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_indexing_iloc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,303 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

import numpy as np
import pandas as pd

from pyspark import pandas as ps
from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils


class IndexingILocMixin:
@property
def pdf(self):
return pd.DataFrame(
{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)

@property
def pdf2(self):
return pd.DataFrame(
{0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]},
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)

@property
def psdf(self):
return ps.from_pandas(self.pdf)

@property
def psdf2(self):
return ps.from_pandas(self.pdf2)

def test_iloc(self):
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
psdf = ps.from_pandas(pdf)

self.assert_eq(psdf.iloc[0, 0], pdf.iloc[0, 0])
for indexer in [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]:
self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer])
self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
# self.assert_eq(psdf.iloc[psdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer])

self.assertRaisesRegex(
SparkPandasNotImplementedError,
".iloc requires numeric slice, conditional boolean",
lambda: ps.range(10).iloc["a", :],
)

def test_iloc_multiindex_columns(self):
arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]

pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
psdf = ps.from_pandas(pdf)

for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]:
self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer])
self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
# self.assert_eq(psdf.iloc[psdf.index == "B", indexer],
# pdf.iloc[pdf.index == "B", indexer])

def test_iloc_series(self):
pser = pd.Series([1, 2, 3])
psser = ps.from_pandas(pser)

self.assert_eq(psser.iloc[0], pser.iloc[0])
self.assert_eq(psser.iloc[:], pser.iloc[:])
self.assert_eq(psser.iloc[:1], pser.iloc[:1])
self.assert_eq(psser.iloc[:-1], pser.iloc[:-1])

self.assert_eq((psser + 1).iloc[0], (pser + 1).iloc[0])
self.assert_eq((psser + 1).iloc[:], (pser + 1).iloc[:])
self.assert_eq((psser + 1).iloc[:1], (pser + 1).iloc[:1])
self.assert_eq((psser + 1).iloc[:-1], (pser + 1).iloc[:-1])

def test_iloc_slice_rows_sel(self):
pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
psdf = ps.from_pandas(pdf)

for rows_sel in [
slice(None),
slice(0, 1),
slice(1, 2),
slice(-3, None),
slice(None, -3),
slice(None, 0),
slice(None, None, 3),
slice(3, 8, 2),
slice(None, None, -2),
slice(8, 3, -2),
slice(8, None, -2),
slice(None, 3, -2),
]:
with self.subTest(rows_sel=rows_sel):
self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
self.assert_eq(
psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index()
)
self.assert_eq(
(psdf.A + 1).iloc[rows_sel].sort_index(),
(pdf.A + 1).iloc[rows_sel].sort_index(),
)

def test_iloc_iterable_rows_sel(self):
pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
psdf = ps.from_pandas(pdf)

for rows_sel in [
[],
np.array([0, 1]),
[1, 2],
np.array([-3]),
[3],
np.array([-2]),
[8, 3, -5],
]:
with self.subTest(rows_sel=rows_sel):
self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
self.assert_eq(
psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index()
)
self.assert_eq(
(psdf.A + 1).iloc[rows_sel].sort_index(),
(pdf.A + 1).iloc[rows_sel].sort_index(),
)

with self.subTest(rows_sel=rows_sel):
self.assert_eq(
psdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index()
)

with self.subTest(rows_sel=rows_sel):
self.assert_eq(
psdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index()
)

def test_frame_iloc_setitem(self):
pdf = pd.DataFrame(
[[1, 2], [4, 5], [7, 8]],
index=["cobra", "viper", "sidewinder"],
columns=["max_speed", "shield"],
)
psdf = ps.from_pandas(pdf)

pdf.iloc[[1, 2], [1, 0]] = 10
psdf.iloc[[1, 2], [1, 0]] = 10
self.assert_eq(psdf, pdf)

pdf.iloc[0, 1] = 50
psdf.iloc[0, 1] = 50
self.assert_eq(psdf, pdf)

with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
psdf.iloc[0, 0] = -psdf.max_speed
with self.assertRaisesRegex(ValueError, "shape mismatch"):
psdf.iloc[:, [1, 0]] = -psdf.max_speed
with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
psdf.iloc[:, 0] = psdf

pdf = pd.DataFrame(
[[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
)
psdf = ps.from_pandas(pdf)

pdf.iloc[:, 0] = pdf
psdf.iloc[:, 0] = psdf
self.assert_eq(psdf, pdf)

def test_series_iloc_setitem(self):
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
psdf = ps.from_pandas(pdf)

pser = pdf.x
psery = pdf.y
psser = psdf.x
pssery = psdf.y

piloc = pser.iloc
kiloc = psser.iloc

pser1 = pser + 1
psser1 = psser + 1

for key, value in [
([1, 2], 10),
(1, 50),
(slice(None), 10),
(slice(None, 1), 20),
(slice(1, None), 30),
]:
with self.subTest(key=key, value=value):
pser.iloc[key] = value
psser.iloc[key] = value
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
self.assert_eq(pssery, psery)

piloc[key] = -value
kiloc[key] = -value
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
self.assert_eq(pssery, psery)

pser1.iloc[key] = value
psser1.iloc[key] = value
self.assert_eq(psser1, pser1)
self.assert_eq(psdf, pdf)
self.assert_eq(pssery, psery)

with self.assertRaises(ValueError):
psser.iloc[1] = -psser

pser = pd.Index([1, 2, 3]).to_series()
psser = ps.Index([1, 2, 3]).to_series()

pser1 = pser + 1
psser1 = psser + 1

pser.iloc[0] = 10
psser.iloc[0] = 10
self.assert_eq(psser, pser)

pser1.iloc[0] = 20
psser1.iloc[0] = 20
self.assert_eq(psser1, pser1)

pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
psdf = ps.from_pandas(pdf)

pser = pdf.a
psser = psdf.a

pser.iloc[[0, 1, 2]] = -pdf.b
psser.iloc[[0, 1, 2]] = -psdf.b
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)

with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
psser.iloc[1] = psdf[["b"]]

def test_iloc_raises(self):
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
psdf = ps.from_pandas(pdf)

with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"):
psdf.iloc[[0, 1], [0, 1], [1, 2]]

with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"):
psdf.A.iloc[[0, 1], [0, 1]]

with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
psdf.iloc[:"b", :]

with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
psdf.iloc[:, :"b"]

with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"):
psdf.iloc[:, ["A"]]

with self.assertRaisesRegex(ValueError, "Location based indexing can only have"):
psdf.iloc[:, "A"]

with self.assertRaisesRegex(IndexError, "out of range"):
psdf.iloc[:, [5, 6]]


class IndexingILocTests(
IndexingILocMixin,
PandasOnSparkTestCase,
SQLTestUtils,
):
pass


if __name__ == "__main__":
from pyspark.pandas.tests.indexes.test_indexing_iloc import * # noqa: F401

try:
import xmlrunner

testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)
Loading

0 comments on commit 7cad075

Please sign in to comment.