Eventual-Inc · desmondcheongzx · Jul 12, 2024 · Jul 9, 2024 · Jul 11, 2024 · jaychia
diff --git a/daft/daft.pyi b/daft/daft.pyi
@@ -1147,6 +1147,7 @@ class PyExpr:
     def list_min(self) -> PyExpr: ...
     def list_max(self) -> PyExpr: ...
     def list_slice(self, start: PyExpr, end: PyExpr) -> PyExpr: ...
+    def list_chunk(self, size: int) -> PyExpr: ...
     def struct_get(self, name: str) -> PyExpr: ...
     def map_get(self, key: PyExpr) -> PyExpr: ...
     def url_download(

diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -2597,6 +2597,18 @@ def slice(self, start: int | Expression, end: int | Expression) -> Expression:
         end_expr = Expression._to_expression(end)
         return Expression._from_pyexpr(self._expr.list_slice(start_expr._expr, end_expr._expr))
 
+    def chunk(self, size: int) -> Expression:
+        """Splits each list into chunks of the given size
+
+        Args:
+            size: size of chunks to split the list into. Must be greater than 0
+        Returns:
+            Expression: an expression with lists of fixed size lists of the type of the list values
+        """
+        if not (isinstance(size, int) and size > 0):
+            raise ValueError(f"Invalid value for `size`: {size}")
+        return Expression._from_pyexpr(self._expr.list_chunk(size))
+
     def sum(self) -> Expression:
         """Sums each list. Empty lists and lists with all nulls yield null.
 

diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst
@@ -208,6 +208,8 @@ List
    Expression.list.join
    Expression.list.lengths
    Expression.list.get
+   Expression.list.slice
+   Expression.list.chunk
 
 Struct
 ######

diff --git a/src/daft-core/src/array/ops/list.rs b/src/daft-core/src/array/ops/list.rs
@@ -56,7 +56,7 @@ fn create_iter<'a>(arr: &'a Int64Array, len: usize) -> Box<dyn Iterator<Item = i
     }
 }
 
-pub fn get_slices_helper(
+fn get_slices_helper(
     mut parent_offsets: impl Iterator<Item = i64>,
     field: Arc<Field>,
     child_data_type: &DataType,
@@ -118,6 +118,84 @@ pub fn get_slices_helper(
     .into_series())
 }
 
+/// Builds a list-of-fixed-size-lists Series by cutting every list stored in `flat_child` into
+/// chunks of `size` elements. Elements left over at the end of a list (fewer than `size`) are
+/// dropped.
+///
+/// Two strategies are used, selected by `total_elements_to_skip`:
+///
+/// * Zero leftovers: every list divides cleanly, so `flat_child` is reused as-is as the values of
+///   the new fixed size list layer and no element data is copied.
+/// * Otherwise: only the elements that belong to complete chunks are copied into a new, compacted
+///   child so that the leftovers no longer sit between chunks.
+///
+/// # Arguments
+///
+/// * `flat_child`  - The flattened element Series the chunks are taken from.
+/// * `field`       - The field of the parent list.
+/// * `validity`    - The parent list's validity; it is cloned onto the outermost list.
+/// * `size`        - The number of elements in every chunk.
+/// * `total_elements_to_skip` - How many elements, summed over all lists, do not fit into a chunk.
+/// * `to_skip`     - Per-list count of leftover elements. Only consumed when
+///                   `total_elements_to_skip` is non-zero.
+/// * `new_offsets` - Offsets of the outermost list of the result, counted in chunks per list.
+///
+/// # Panics
+///
+/// Panics if `total_elements_to_skip` is non-zero while `to_skip` is `None`, or if `to_skip`
+/// yields more entries than `new_offsets` describes lists.
+fn get_chunks_helper(
+    flat_child: &Series,
+    field: Arc<Field>,
+    validity: Option<&arrow2::bitmap::Bitmap>,
+    size: usize,
+    total_elements_to_skip: usize,
+    to_skip: Option<impl Iterator<Item = usize>>,
+    new_offsets: Vec<i64>,
+) -> DaftResult<Series> {
+    let element_field = field.to_exploded_field()?;
+
+    // The two paths only differ in how the values of the fixed size list layer are obtained.
+    let chunk_values = if total_elements_to_skip == 0 {
+        flat_child.clone()
+    } else {
+        let mut compacted: Box<dyn Growable> = make_growable(
+            &field.name,
+            &element_field.dtype,
+            vec![flat_child],
+            false, // The new layer carries no validity of its own, see below.
+            flat_child.len() - total_elements_to_skip,
+        );
+        // `cursor` walks over `flat_child`: copy the chunked prefix of each list, then jump over
+        // that list's leftovers.
+        let mut cursor = 0;
+        for (list_idx, leftover) in to_skip.unwrap().enumerate() {
+            let chunk_count = new_offsets[list_idx + 1] - new_offsets[list_idx];
+            let kept = chunk_count as usize * size;
+            compacted.extend(0, cursor, kept);
+            cursor += kept + leftover;
+        }
+        compacted.build()?
+    };
+
+    let chunk_field = element_field.to_fixed_size_list_field(size)?;
+    // The freshly inserted fixed size list layer has no validity information: the outermost list
+    // takes the parent's validity, and the element values keep whatever validity they had.
+    let chunks = FixedSizeListArray::new(chunk_field.clone(), chunk_values, None);
+    let chunked_lists = ListArray::new(
+        chunk_field.to_list_field()?,
+        chunks.into_series(),
+        arrow2::offset::OffsetsBuffer::try_from(new_offsets)?,
+        validity.cloned(), // Copy the parent's validity.
+    );
+    Ok(chunked_lists.into_series())
+}
+
+
 impl ListArray {
     pub fn count(&self, mode: CountMode) -> DaftResult<UInt64Array> {
         let counts = match (mode, self.flat_child.validity()) {
@@ -274,6 +352,34 @@ impl ListArray {
             end_iter,
         )
     }
+
+    /// Splits every list into chunks of `size` elements, producing a list of fixed size lists.
+    ///
+    /// Elements at the end of a list that do not fill a whole chunk are dropped. The validity of
+    /// this array is carried over to the outermost list of the result.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `ValueError` if `size` is 0, and propagates any error raised while slicing the
+    /// child values or assembling the result.
+    pub fn get_chunks(&self, size: usize) -> DaftResult<Series> {
+        // A zero chunk size would otherwise panic on the modulo/division below.
+        if size == 0 {
+            return Err(common_error::DaftError::ValueError(
+                "Expected chunk size to be greater than 0, got 0".to_string(),
+            ));
+        }
+
+        // These buffers hold one entry per list (not one per element).
+        let num_lists = self.offsets().len().saturating_sub(1);
+        let mut to_skip = Vec::with_capacity(num_lists);
+        let mut new_offsets = Vec::with_capacity(num_lists + 1);
+        let mut total_elements_to_skip = 0;
+        let mut chunks_so_far = 0i64;
+        new_offsets.push(chunks_so_far);
+        for bounds in self.offsets().windows(2) {
+            let list_len = (bounds[1] - bounds[0]) as usize;
+            let leftover = list_len % size;
+            to_skip.push(leftover);
+            total_elements_to_skip += leftover;
+            chunks_so_far += (list_len / size) as i64;
+            new_offsets.push(chunks_so_far);
+        }
+
+        // The offsets may cover only a sub-range of `flat_child` (e.g. when this array is a slice
+        // of a larger one). The helper reads the child values starting at index 0, so narrow the
+        // child down to exactly the range the offsets refer to.
+        let child_start = *self.offsets().first() as usize;
+        let child_end = *self.offsets().last() as usize;
+        let flat_child = if child_start == 0 && child_end == self.flat_child.len() {
+            self.flat_child.clone()
+        } else {
+            self.flat_child.slice(child_start, child_end)?
+        };
+
+        // The per-list leftovers are only needed when something actually has to be skipped.
+        let to_skip = (total_elements_to_skip != 0).then(|| to_skip.into_iter());
+        get_chunks_helper(
+            &flat_child,
+            self.field.clone(),
+            self.validity(),
+            size,
+            total_elements_to_skip,
+            to_skip,
+            new_offsets,
+        )
+    }
 }
 
 impl FixedSizeListArray {
@@ -420,6 +526,34 @@ impl FixedSizeListArray {
             end_iter,
         )
     }
+
+    /// Splits every fixed size list into chunks of `size` elements, producing a list of fixed
+    /// size lists.
+    ///
+    /// Every list has the same length, so each one yields the same number of chunks and drops
+    /// the same number of trailing elements that do not fill a whole chunk. The validity of this
+    /// array is carried over to the outermost list of the result.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `ValueError` if `size` is 0, and propagates any error raised while assembling
+    /// the result.
+    pub fn get_chunks(&self, size: usize) -> DaftResult<Series> {
+        // A zero chunk size would otherwise panic on the division/modulo below.
+        if size == 0 {
+            return Err(common_error::DaftError::ValueError(
+                "Expected chunk size to be greater than 0, got 0".to_string(),
+            ));
+        }
+
+        let list_size = self.fixed_element_len();
+        let num_chunks = list_size / size;
+        let modulo = list_size % size;
+        let total_elements_to_skip = modulo * self.len();
+        // Each list contributes exactly `num_chunks` chunks, so the i-th offset is
+        // `i * num_chunks`. This also yields `[0]` for an empty array and all zeros when the
+        // lists are shorter than one chunk.
+        let new_offsets: Vec<i64> = (0..=self.len())
+            .map(|i| (i * num_chunks) as i64)
+            .collect();
+        // The per-list leftovers are only needed when something actually has to be skipped.
+        let to_skip =
+            (total_elements_to_skip != 0).then(|| std::iter::repeat(modulo).take(self.len()));
+        get_chunks_helper(
+            &self.flat_child,
+            self.field.clone(),
+            self.validity(),
+            size,
+            total_elements_to_skip,
+            to_skip,
+            new_offsets,
+        )
+    }
 }
 
 macro_rules! impl_aggs_list_array {

diff --git a/src/daft-core/src/datatypes/field.rs b/src/daft-core/src/datatypes/field.rs
@@ -113,6 +113,18 @@ impl Field {
         })
     }
 
+    /// Returns a field with the same name and metadata whose dtype is a fixed size list of
+    /// `size` elements of this field's dtype.
+    ///
+    /// Fields with a Python dtype are returned unchanged.
+    pub fn to_fixed_size_list_field(&self, size: usize) -> DaftResult<Self> {
+        let wrapped = if self.dtype.is_python() {
+            self.clone()
+        } else {
+            Self {
+                name: self.name.clone(),
+                dtype: DataType::FixedSizeList(Box::new(self.dtype.clone()), size),
+                metadata: self.metadata.clone(),
+            }
+        };
+        Ok(wrapped)
+    }
+
     pub fn to_exploded_field(&self) -> DaftResult<Self> {
         match &self.dtype {
             DataType::List(child_dtype) | DataType::FixedSizeList(child_dtype, _) => {

diff --git a/src/daft-core/src/series/ops/list.rs b/src/daft-core/src/series/ops/list.rs
@@ -83,6 +83,16 @@ impl Series {
         }
     }
 
+    /// Splits each list in this Series into chunks of `size` elements.
+    ///
+    /// Dispatches to the `get_chunks` implementation of the underlying list or fixed size list
+    /// array.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TypeError` when the Series is neither a `List` nor a `FixedSizeList`.
+    pub fn list_chunk(&self, size: usize) -> DaftResult<Series> {
+        match self.data_type() {
+            DataType::List(_) => self.list()?.get_chunks(size),
+            DataType::FixedSizeList(..) => self.fixed_size_list()?.get_chunks(size),
+            dt => Err(DaftError::TypeError(format!(
+                "list chunk not implemented for {dt}"
+            ))),
+        }
+    }
+
     pub fn list_sum(&self) -> DaftResult<Series> {
         match self.data_type() {
             DataType::List(_) => self.list()?.sum(),

diff --git a/src/daft-dsl/src/functions/list/chunk.rs b/src/daft-dsl/src/functions/list/chunk.rs
@@ -0,0 +1,53 @@
+use crate::ExprRef;
+use daft_core::{datatypes::Field, schema::Schema, series::Series};
+
+use super::{super::FunctionEvaluator, ListExpr};
+use crate::functions::FunctionExpr;
+use common_error::{DaftError, DaftResult};
+
+/// Evaluator for the list `chunk` function.
+pub(super) struct ChunkEvaluator {}
+
+/// Reads the chunk size stored in a `ListExpr::Chunk` function expression.
+///
+/// # Panics
+///
+/// Panics if `expr` is any other kind of function expression.
+fn chunk_size_of(expr: &FunctionExpr) -> usize {
+    match expr {
+        FunctionExpr::List(ListExpr::Chunk(size)) => *size,
+        _ => panic!("Expected Chunk Expr, got {expr}"),
+    }
+}
+
+impl FunctionEvaluator for ChunkEvaluator {
+    fn fn_name(&self) -> &'static str {
+        "chunk"
+    }
+
+    /// Derives the output field: the input's element field wrapped in a fixed size list of the
+    /// chunk size, which is then wrapped in a list.
+    fn to_field(
+        &self,
+        inputs: &[ExprRef],
+        schema: &Schema,
+        expr: &FunctionExpr,
+    ) -> DaftResult<Field> {
+        let size = chunk_size_of(expr);
+        if let [input] = inputs {
+            let element_field = input.to_field(schema)?.to_exploded_field()?;
+            let chunk_field = element_field.to_fixed_size_list_field(size)?;
+            Ok(chunk_field.to_list_field()?)
+        } else {
+            Err(DaftError::SchemaMismatch(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            )))
+        }
+    }
+
+    /// Chunks the single input Series with the size carried by `expr`.
+    fn evaluate(&self, inputs: &[Series], expr: &FunctionExpr) -> DaftResult<Series> {
+        let size = chunk_size_of(expr);
+        if let [input] = inputs {
+            input.list_chunk(size)
+        } else {
+            Err(DaftError::ValueError(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            )))
+        }
+    }
+}
diff --git a/src/daft-dsl/src/functions/list/mod.rs b/src/daft-dsl/src/functions/list/mod.rs
@@ -1,3 +1,4 @@
+mod chunk;
 mod count;
 mod explode;
 mod get;
@@ -8,6 +9,7 @@ mod min;
 mod slice;
 mod sum;
 
+use chunk::ChunkEvaluator;
 use count::CountEvaluator;
 use daft_core::CountMode;
 use explode::ExplodeEvaluator;
@@ -35,6 +37,7 @@ pub enum ListExpr {
     Min,
     Max,
     Slice,
+    Chunk(usize),
 }
 
 impl ListExpr {
@@ -51,6 +54,7 @@ impl ListExpr {
             Min => &MinEvaluator {},
             Max => &MaxEvaluator {},
             Slice => &SliceEvaluator {},
+            Chunk(_) => &ChunkEvaluator {},
         }
     }
 }
@@ -126,3 +130,11 @@ pub fn slice(input: ExprRef, start: ExprRef, end: ExprRef) -> ExprRef {
     }
     .into()
 }
+
+/// Builds a function expression that applies `ListExpr::Chunk(size)` to `input`.
+pub fn chunk(input: ExprRef, size: usize) -> ExprRef {
+    Expr::Function {
+        func: super::FunctionExpr::List(ListExpr::Chunk(size)),
+        inputs: vec![input],
+    }
+    .into()
+}
diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs
@@ -799,6 +799,11 @@ impl PyExpr {
         Ok(slice(self.into(), start.into(), end.into()).into())
     }
 
+    // Python binding for the list `chunk` expression: wraps this expression in a chunk
+    // function expression with the given `size`.
+    pub fn list_chunk(&self, size: usize) -> PyResult<Self> {
+        use crate::functions::list::chunk;
+        Ok(chunk(self.into(), size).into())
+    }
+
     pub fn struct_get(&self, name: &str) -> PyResult<Self> {
         use crate::functions::struct_::get;
         Ok(get(self.into(), name).into())