diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 7b34bbf25bb63..4e24eb7b71f1b 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -817,6 +817,7 @@ def __hash__(self): "pyspark.pandas.tests.indexes.test_align", "pyspark.pandas.tests.indexes.test_indexing", "pyspark.pandas.tests.indexes.test_indexing_basic", + "pyspark.pandas.tests.indexes.test_indexing_iloc", "pyspark.pandas.tests.indexes.test_indexing_loc", "pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx", "pyspark.pandas.tests.indexes.test_reindex", @@ -1092,6 +1093,7 @@ def __hash__(self): "pyspark.pandas.tests.connect.indexes.test_parity_align", "pyspark.pandas.tests.connect.indexes.test_parity_indexing", "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic", + "pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc", "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc", "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx", "pyspark.pandas.tests.connect.indexes.test_parity_reindex", diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_iloc.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_iloc.py new file mode 100644 index 0000000000000..61c36bd113b6e --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_iloc.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_indexing_iloc import IndexingILocMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class IndexingILocParityTests( + IndexingILocMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc import * # noqa + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_indexing_iloc.py b/python/pyspark/pandas/tests/indexes/test_indexing_iloc.py new file mode 100644 index 0000000000000..c864011d2b0fd --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_indexing_iloc.py @@ -0,0 +1,303 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils + + +class IndexingILocMixin: + @property + def pdf(self): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + + @property + def pdf2(self): + return pd.DataFrame( + {0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + @property + def psdf2(self): + return ps.from_pandas(self.pdf2) + + def test_iloc(self): + pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.iloc[0, 0], pdf.iloc[0, 0]) + for indexer in [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]: + self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer]) + self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer]) + self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer]) + # self.assert_eq(psdf.iloc[psdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer]) + + self.assertRaisesRegex( + SparkPandasNotImplementedError, + ".iloc requires numeric slice, conditional boolean", + lambda: ps.range(10).iloc["a", :], + ) + + def test_iloc_multiindex_columns(self): + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] + + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) + psdf = ps.from_pandas(pdf) + + for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]: + self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer]) + self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer]) + self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer]) + # self.assert_eq(psdf.iloc[psdf.index == "B", indexer], + # pdf.iloc[pdf.index == "B", indexer]) + + def test_iloc_series(self): + pser = pd.Series([1, 2, 3]) + psser = ps.from_pandas(pser) + + self.assert_eq(psser.iloc[0], pser.iloc[0]) + self.assert_eq(psser.iloc[:], pser.iloc[:]) + self.assert_eq(psser.iloc[:1], pser.iloc[:1]) + self.assert_eq(psser.iloc[:-1], pser.iloc[:-1]) + + self.assert_eq((psser + 1).iloc[0], (pser + 1).iloc[0]) + self.assert_eq((psser + 1).iloc[:], (pser + 1).iloc[:]) + self.assert_eq((psser + 1).iloc[:1], (pser + 1).iloc[:1]) + self.assert_eq((psser + 1).iloc[:-1], (pser + 1).iloc[:-1]) + + def test_iloc_slice_rows_sel(self): + pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5}) + psdf = ps.from_pandas(pdf) + + for rows_sel in [ + slice(None), + slice(0, 1), + slice(1, 2), + slice(-3, None), + slice(None, -3), + slice(None, 0), + slice(None, None, 3), + slice(3, 8, 2), + slice(None, None, -2), + slice(8, 3, -2), + slice(8, None, -2), + slice(None, 3, -2), + ]: + with self.subTest(rows_sel=rows_sel): + self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index()) + self.assert_eq( + psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index() + ) + self.assert_eq( + (psdf.A + 1).iloc[rows_sel].sort_index(), + (pdf.A + 1).iloc[rows_sel].sort_index(), + ) + + def test_iloc_iterable_rows_sel(self): + pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5}) + psdf = ps.from_pandas(pdf) + + for rows_sel in [ + [], + np.array([0, 1]), + [1, 2], + np.array([-3]), + [3], + np.array([-2]), + [8, 3, -5], + ]: + with self.subTest(rows_sel=rows_sel): + self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index()) + self.assert_eq( + psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index() + ) + self.assert_eq( + (psdf.A + 1).iloc[rows_sel].sort_index(), + (pdf.A + 1).iloc[rows_sel].sort_index(), + ) + + with self.subTest(rows_sel=rows_sel): + self.assert_eq( + psdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index() + ) + + with self.subTest(rows_sel=rows_sel): + self.assert_eq( + psdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index() + ) + + def test_frame_iloc_setitem(self): + pdf = pd.DataFrame( + [[1, 2], [4, 5], [7, 8]], + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) + psdf = ps.from_pandas(pdf) + + pdf.iloc[[1, 2], [1, 0]] = 10 + psdf.iloc[[1, 2], [1, 0]] = 10 + self.assert_eq(psdf, pdf) + + pdf.iloc[0, 1] = 50 + psdf.iloc[0, 1] = 50 + self.assert_eq(psdf, pdf) + + with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): + psdf.iloc[0, 0] = -psdf.max_speed + with self.assertRaisesRegex(ValueError, "shape mismatch"): + psdf.iloc[:, [1, 0]] = -psdf.max_speed + with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"): + psdf.iloc[:, 0] = psdf + + pdf = pd.DataFrame( + [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"] + ) + psdf = ps.from_pandas(pdf) + + pdf.iloc[:, 0] = pdf + psdf.iloc[:, 0] = psdf + self.assert_eq(psdf, pdf) + + def test_series_iloc_setitem(self): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) + psdf = ps.from_pandas(pdf) + + pser = pdf.x + psery = pdf.y + psser = psdf.x + pssery = psdf.y + + piloc = pser.iloc + kiloc = psser.iloc + + pser1 = pser + 1 + psser1 = psser + 1 + + for key, value in [ + ([1, 2], 10), + (1, 50), + (slice(None), 10), + (slice(None, 1), 20), + (slice(1, None), 30), + ]: + with self.subTest(key=key, value=value): + pser.iloc[key] = value + psser.iloc[key] = value + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) + self.assert_eq(pssery, psery) + + piloc[key] = -value + kiloc[key] = -value + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) + self.assert_eq(pssery, psery) + + pser1.iloc[key] = value + psser1.iloc[key] = value + self.assert_eq(psser1, pser1) + self.assert_eq(psdf, pdf) + self.assert_eq(pssery, psery) + + with self.assertRaises(ValueError): + psser.iloc[1] = -psser + + pser = pd.Index([1, 2, 3]).to_series() + psser = ps.Index([1, 2, 3]).to_series() + + pser1 = pser + 1 + psser1 = psser + 1 + + pser.iloc[0] = 10 + psser.iloc[0] = 10 + self.assert_eq(psser, pser) + + pser1.iloc[0] = 20 + psser1.iloc[0] = 20 + self.assert_eq(psser1, pser1) + + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + psdf = ps.from_pandas(pdf) + + pser = pdf.a + psser = psdf.a + + pser.iloc[[0, 1, 2]] = -pdf.b + psser.iloc[[0, 1, 2]] = -psdf.b + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) + + with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): + psser.iloc[1] = psdf[["b"]] + + def test_iloc_raises(self): + pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + psdf = ps.from_pandas(pdf) + + with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"): + psdf.iloc[[0, 1], [0, 1], [1, 2]] + + with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"): + psdf.A.iloc[[0, 1], [0, 1]] + + with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): + psdf.iloc[:"b", :] + + with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): + psdf.iloc[:, :"b"] + + with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"): + psdf.iloc[:, ["A"]] + + with self.assertRaisesRegex(ValueError, "Location based indexing can only have"): + psdf.iloc[:, "A"] + + with self.assertRaisesRegex(IndexError, "out of range"): + psdf.iloc[:, [5, 6]] + + +class IndexingILocTests( + IndexingILocMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_indexing_iloc import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index e90db26223bc1..ef496c3b55650 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -497,223 +497,6 @@ def test_iloc(self): lambda: ps.range(10).iloc["a", :], ) - def test_iloc_multiindex_columns(self): - arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - - pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) - psdf = ps.from_pandas(pdf) - - for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]: - self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer]) - self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer]) - self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer]) - # self.assert_eq(psdf.iloc[psdf.index == "B", indexer], - # pdf.iloc[pdf.index == "B", indexer]) - - def test_iloc_series(self): - pser = pd.Series([1, 2, 3]) - psser = ps.from_pandas(pser) - - self.assert_eq(psser.iloc[0], pser.iloc[0]) - self.assert_eq(psser.iloc[:], pser.iloc[:]) - self.assert_eq(psser.iloc[:1], pser.iloc[:1]) - self.assert_eq(psser.iloc[:-1], pser.iloc[:-1]) - - self.assert_eq((psser + 1).iloc[0], (pser + 1).iloc[0]) - self.assert_eq((psser + 1).iloc[:], (pser + 1).iloc[:]) - self.assert_eq((psser + 1).iloc[:1], (pser + 1).iloc[:1]) - self.assert_eq((psser + 1).iloc[:-1], (pser + 1).iloc[:-1]) - - def test_iloc_slice_rows_sel(self): - pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5}) - psdf = ps.from_pandas(pdf) - - for rows_sel in [ - slice(None), - slice(0, 1), - slice(1, 2), - slice(-3, None), - slice(None, -3), - slice(None, 0), - slice(None, None, 3), - slice(3, 8, 2), - slice(None, None, -2), - slice(8, 3, -2), - slice(8, None, -2), - slice(None, 3, -2), - ]: - with self.subTest(rows_sel=rows_sel): - self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index()) - self.assert_eq( - psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index() - ) - self.assert_eq( - (psdf.A + 1).iloc[rows_sel].sort_index(), - (pdf.A + 1).iloc[rows_sel].sort_index(), - ) - - def test_iloc_iterable_rows_sel(self): - pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5}) - psdf = ps.from_pandas(pdf) - - for rows_sel in [ - [], - np.array([0, 1]), - [1, 2], - np.array([-3]), - [3], - np.array([-2]), - [8, 3, -5], - ]: - with self.subTest(rows_sel=rows_sel): - self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index()) - self.assert_eq( - psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index() - ) - self.assert_eq( - (psdf.A + 1).iloc[rows_sel].sort_index(), - (pdf.A + 1).iloc[rows_sel].sort_index(), - ) - - with self.subTest(rows_sel=rows_sel): - self.assert_eq( - psdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index() - ) - - with self.subTest(rows_sel=rows_sel): - self.assert_eq( - psdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index() - ) - - def test_frame_iloc_setitem(self): - pdf = pd.DataFrame( - [[1, 2], [4, 5], [7, 8]], - index=["cobra", "viper", "sidewinder"], - columns=["max_speed", "shield"], - ) - psdf = ps.from_pandas(pdf) - - pdf.iloc[[1, 2], [1, 0]] = 10 - psdf.iloc[[1, 2], [1, 0]] = 10 - self.assert_eq(psdf, pdf) - - pdf.iloc[0, 1] = 50 - psdf.iloc[0, 1] = 50 - self.assert_eq(psdf, pdf) - - with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): - psdf.iloc[0, 0] = -psdf.max_speed - with self.assertRaisesRegex(ValueError, "shape mismatch"): - psdf.iloc[:, [1, 0]] = -psdf.max_speed - with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"): - psdf.iloc[:, 0] = psdf - - pdf = pd.DataFrame( - [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"] - ) - psdf = ps.from_pandas(pdf) - - pdf.iloc[:, 0] = pdf - psdf.iloc[:, 0] = psdf - self.assert_eq(psdf, pdf) - - def test_series_iloc_setitem(self): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) - psdf = ps.from_pandas(pdf) - - pser = pdf.x - psery = pdf.y - psser = psdf.x - pssery = psdf.y - - piloc = pser.iloc - kiloc = psser.iloc - - pser1 = pser + 1 - psser1 = psser + 1 - - for key, value in [ - ([1, 2], 10), - (1, 50), - (slice(None), 10), - (slice(None, 1), 20), - (slice(1, None), 30), - ]: - with self.subTest(key=key, value=value): - pser.iloc[key] = value - psser.iloc[key] = value - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) - self.assert_eq(pssery, psery) - - piloc[key] = -value - kiloc[key] = -value - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) - self.assert_eq(pssery, psery) - - pser1.iloc[key] = value - psser1.iloc[key] = value - self.assert_eq(psser1, pser1) - self.assert_eq(psdf, pdf) - self.assert_eq(pssery, psery) - - with self.assertRaises(ValueError): - psser.iloc[1] = -psser - - pser = pd.Index([1, 2, 3]).to_series() - psser = ps.Index([1, 2, 3]).to_series() - - pser1 = pser + 1 - psser1 = psser + 1 - - pser.iloc[0] = 10 - psser.iloc[0] = 10 - self.assert_eq(psser, pser) - - pser1.iloc[0] = 20 - psser1.iloc[0] = 20 - self.assert_eq(psser1, pser1) - - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - psdf = ps.from_pandas(pdf) - - pser = pdf.a - psser = psdf.a - - pser.iloc[[0, 1, 2]] = -pdf.b - psser.iloc[[0, 1, 2]] = -psdf.b - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) - - with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): - psser.iloc[1] = psdf[["b"]] - - def test_iloc_raises(self): - pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - psdf = ps.from_pandas(pdf) - - with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"): - psdf.iloc[[0, 1], [0, 1], [1, 2]] - - with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"): - psdf.A.iloc[[0, 1], [0, 1]] - - with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): - psdf.iloc[:"b", :] - - with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"): - psdf.iloc[:, :"b"] - - with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"): - psdf.iloc[:, ["A"]] - - with self.assertRaisesRegex(ValueError, "Location based indexing can only have"): - psdf.iloc[:, "A"] - - with self.assertRaisesRegex(IndexError, "out of range"): - psdf.iloc[:, [5, 6]] - def test_index_operator_datetime(self): dates = pd.date_range("20130101", periods=6) pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))