From 0cbf1cd25733bb6835e244e0e1883eec6dc1df17 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Wed, 9 Nov 2022 18:36:26 +0000
Subject: [PATCH 1/2] when writing in batches, handle all na arrays properly

---
Reviewer note (text in this area is not part of the commit message):
every branch of `_convert_field` touched below now returns its result
directly. The list and struct branches return
`pa.nulls(len(native_arr), typ)` when `pa.array(col)` comes back as a
`pa.NullArray`, before `.offsets`, `.values` or `.field()` are accessed.

 python/lance/data/convert/base.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/lance/data/convert/base.py b/python/lance/data/convert/base.py
index b10065ab48..69fad6b182 100644
--- a/python/lance/data/convert/base.py
+++ b/python/lance/data/convert/base.py
@@ -106,9 +106,11 @@ def _convert_field(self, name, typ, col):
         """pyarrow is unable to convert ExtensionTypes properly in pa.Table.from_pandas"""
         if isinstance(typ, pa.ExtensionType):
             storage = pa.array(col, type=typ.storage_type)
-            arr = pa.ExtensionArray.from_storage(typ, storage)
+            return pa.ExtensionArray.from_storage(typ, storage)
         elif pa.types.is_list(typ):
             native_arr = pa.array(col)
+            if isinstance(native_arr, pa.NullArray):
+                return pa.nulls(len(native_arr), typ)
             offsets = native_arr.offsets
             values = native_arr.values.to_numpy(zero_copy_only=False)
             return pa.ListArray.from_arrays(
@@ -116,6 +118,8 @@ def _convert_field(self, name, typ, col):
             )
         elif pa.types.is_struct(typ):
             native_arr = pa.array(col)
+            if isinstance(native_arr, pa.NullArray):
+                return pa.nulls(len(native_arr), typ)
             arrays = []
             for subfield in typ:
                 sub_arr = native_arr.field(subfield.name)
@@ -127,8 +131,7 @@ def _convert_field(self, name, typ, col):
                 arrays.append(converted)
             return pa.StructArray.from_arrays(arrays, fields=typ)
         else:
-            arr = pa.array(col, type=typ)
-        return arr
+            return pa.array(col, type=typ)
 
     def make_embedded_dataset(
         self,

From 90297a4127ce3ec472362a0da9be8681ecf06501 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Thu, 10 Nov 2022 17:17:21 -0800
Subject: [PATCH 2/2] address pr comments

---
Reviewer note (text in this area is not part of the commit message):
adds `test_na`, which calls `_convert_field` with all-None struct and
list columns and asserts that the result has the requested type and is
entirely null. Also adds `python/tools/test_linux.sh`, which removes
`wheels`, runs `./tools/build_wheel.sh`, force-reinstalls the built
wheel and runs `pytest lance/tests`.

 .../tests/data/convert/test_oxford_pet.py     | 23 +++++++++++++++++++
 python/tools/test_linux.sh                    | 10 ++++++++
 2 files changed, 33 insertions(+)
 create mode 100755 python/tools/test_linux.sh

diff --git a/python/lance/tests/data/convert/test_oxford_pet.py b/python/lance/tests/data/convert/test_oxford_pet.py
index 62b4c38301..339daf48f9 100644
--- a/python/lance/tests/data/convert/test_oxford_pet.py
+++ b/python/lance/tests/data/convert/test_oxford_pet.py
@@ -10,6 +10,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy as np
+import pyarrow as pa
 
 from lance.data.convert.oxford_pet import OxfordPetConverter
 
@@ -23,3 +25,24 @@ def test_basic(tmp_path):
     df = c.read_metadata(num_rows)
     c.make_embedded_dataset(df, fmt="lance", output_path=str(tmp_path / "oxford_pet.lance"))
     c.make_embedded_dataset(df, fmt="parquet", output_path=str(tmp_path / "oxford_pet.parquet"))
+
+
+# when writing iteratively sometimes we get all NAs in a column
+def test_na(tmp_path):
+    c = OxfordPetConverter(
+        uri_root="s3://eto-public/datasets/oxford_pet",
+        images_root="https://eto-public.s3.us-west-2.amazonaws.com/datasets/oxford_pet/"
+    )
+    name = "null_struct"
+    typ = pa.struct([pa.field("name", pa.string())])
+    col = np.array([None, None])
+    arr = c._convert_field(name, typ, col)
+    assert arr.type == typ
+    assert arr.is_null().to_numpy(False).all()
+
+    name = "null_list"
+    typ = pa.list_(pa.string())
+    col = np.array([None, None])
+    arr = c._convert_field(name, typ, col)
+    assert arr.type == typ
+    assert arr.is_null().to_numpy(False).all()
diff --git a/python/tools/test_linux.sh b/python/tools/test_linux.sh
new file mode 100755
index 0000000000..a3c8085760
--- /dev/null
+++ b/python/tools/test_linux.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -e
+
+MINOR_VERSION=${1:-cp310}
+
+sudo rm -rf wheels
+./tools/build_wheel.sh $MINOR_VERSION
+pip install --force-reinstall wheels/*.whl
+pytest lance/tests
\ No newline at end of file