Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement support for list of Dictionaries #664

Merged
merged 3 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions rust/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ use std::collections::HashMap;
use std::fmt::Formatter;
use std::fmt::{self};

use arrow_array::cast::{as_large_list_array, as_list_array};
use arrow_array::types::{
Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
use arrow_array::{cast::as_dictionary_array, Array, ArrayRef, RecordBatch, StructArray};
use arrow_array::{
cast::as_dictionary_array, Array, ArrayRef, LargeListArray, ListArray, RecordBatch, StructArray,
};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema, TimeUnit};
use async_recursion::async_recursion;

Expand Down Expand Up @@ -375,8 +378,16 @@ impl Field {
lance_field.set_dictionary(struct_arr.column(i));
}
}
DataType::List(_) => {
let list_arr = as_list_array(arr);
self.children[0].set_dictionary(list_arr.values());
}
DataType::LargeList(_) => {
let list_arr = as_large_list_array(arr);
self.children[0].set_dictionary(list_arr.values());
}
_ => {
// Add list / large list support.
// Field types that don't support dictionaries
}
}
}
Expand Down
70 changes: 69 additions & 1 deletion rust/src/io/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pub async fn write_manifest(
) -> Result<usize> {
// Write dictionary values.
let max_field_id = manifest.schema.max_field_id().unwrap_or(-1);
for field_id in 1..max_field_id + 1 {
for field_id in 0..max_field_id + 1 {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could potentially be (0..max_field_id), but mut_field_by_id will return None when the id does not exist

if let Some(field) = manifest.schema.mut_field_by_id(field_id) {
if field.data_type().is_dictionary() {
let dict_info = field.dictionary.as_mut().ok_or_else(|| {
Expand Down Expand Up @@ -414,6 +414,24 @@ mod tests {
DataType::LargeList(Box::new(ArrowField::new("item", DataType::Utf8, true))),
true,
),
ArrowField::new(
"l_dict",
DataType::List(Box::new(ArrowField::new(
"item",
DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
true,
))),
true,
),
ArrowField::new(
"large_l_dict",
DataType::LargeList(Box::new(ArrowField::new(
"item",
DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
true,
))),
true,
),
ArrowField::new(
"s",
DataType::Struct(vec![
Expand Down Expand Up @@ -452,6 +470,24 @@ mod tests {
let large_list_arr =
LargeListArray::try_new(large_list_values, &large_list_offsets).unwrap();

let list_dict_offsets = (0..202).step_by(2).collect();
let list_dict_vec = (0..200)
.into_iter()
.map(|n| ["a", "b", "c"][n % 3])
.collect::<Vec<_>>();
let list_dict_arr: DictionaryArray<UInt32Type> = list_dict_vec.into_iter().collect();
let list_dict_arr = ListArray::try_new(list_dict_arr, &list_dict_offsets).unwrap();

let large_list_dict_offsets: Int64Array = (0..202).step_by(2).collect();
let large_list_dict_vec = (0..200)
.into_iter()
.map(|n| ["a", "b", "c"][n % 3])
.collect::<Vec<_>>();
let large_list_dict_arr: DictionaryArray<UInt32Type> =
large_list_dict_vec.into_iter().collect();
let large_list_dict_arr =
LargeListArray::try_new(large_list_dict_arr, &large_list_dict_offsets).unwrap();

let columns: Vec<ArrayRef> = vec![
Arc::new(NullArray::new(100)),
Arc::new(BooleanArray::from_iter(
Expand Down Expand Up @@ -491,6 +527,8 @@ mod tests {
Arc::new(fixed_size_binary_arr),
Arc::new(list_arr),
Arc::new(large_list_arr),
Arc::new(list_dict_arr),
Arc::new(large_list_dict_arr),
Arc::new(StructArray::from(vec![
(
ArrowField::new("si", DataType::Int64, true),
Expand Down Expand Up @@ -518,6 +556,36 @@ mod tests {
assert_eq!(actual, batch);
}

#[tokio::test]
async fn test_dictionary_first_element_file() {
let arrow_schema = ArrowSchema::new(vec![ArrowField::new(
"d",
DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
true,
)]);
let mut schema = Schema::try_from(&arrow_schema).unwrap();

let dict_vec = (0..100)
.into_iter()
.map(|n| ["a", "b", "c"][n % 3])
.collect::<Vec<_>>();
let dict_arr: DictionaryArray<UInt32Type> = dict_vec.into_iter().collect();

let columns: Vec<ArrayRef> = vec![Arc::new(dict_arr)];
let batch = RecordBatch::try_new(Arc::new(arrow_schema), columns).unwrap();
schema.set_dictionary(&batch).unwrap();

let store = ObjectStore::memory();
let path = Path::from("/foo");
let mut file_writer = FileWriter::try_new(&store, &path, &schema).await.unwrap();
file_writer.write(&[&batch]).await.unwrap();
file_writer.finish().await.unwrap();

let reader = FileReader::try_new(&store, &path).await.unwrap();
let actual = reader.read_batch(0, .., reader.schema()).await.unwrap();
assert_eq!(actual, batch);
}

#[tokio::test]
async fn test_write_temporal_types() {
let arrow_schema = Arc::new(ArrowSchema::new(vec![
Expand Down