Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cache miniblock metadata #3323

Merged
merged 2 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion rust/lance-core/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
use std::any::{Any, TypeId};
use std::sync::Arc;

use deepsize::{Context, DeepSizeOf};
use futures::Future;
use moka::sync::Cache;
use object_store::path::Path;

use crate::utils::path::LancePathExt;
use crate::Result;

pub use deepsize::{Context, DeepSizeOf};

type ArcAny = Arc<dyn Any + Send + Sync>;

#[derive(Clone)]
Expand Down Expand Up @@ -121,6 +122,12 @@ impl FileMetadataCache {
}
}

/// Fetch an item from the cache, using a str as the key
pub fn get_by_str<T: Send + Sync + 'static>(&self, path: &str) -> Option<Arc<T>> {
self.get(&Path::parse(path).unwrap())
}

/// Fetch an item from the cache
pub fn get<T: Send + Sync + 'static>(&self, path: &Path) -> Option<Arc<T>> {
let cache = self.cache.as_ref()?;
let temp: Path;
Expand All @@ -135,6 +142,7 @@ impl FileMetadataCache {
.map(|metadata| metadata.record.clone().downcast::<T>().unwrap())
}

/// Insert an item into the cache
pub fn insert<T: DeepSizeOf + Send + Sync + 'static>(&self, path: Path, metadata: Arc<T>) {
let Some(cache) = self.cache.as_ref() else {
return;
Expand All @@ -147,6 +155,15 @@ impl FileMetadataCache {
cache.insert((path, TypeId::of::<T>()), SizedRecord::new(metadata));
}

/// Insert an item into the cache, using a str as the key
pub fn insert_by_str<T: DeepSizeOf + Send + Sync + 'static>(
&self,
key: &str,
metadata: Arc<T>,
) {
self.insert(Path::parse(key).unwrap(), metadata);
}

/// Get an item
///
/// If it exists in the cache return that
Expand Down
3 changes: 3 additions & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ pub const COMPRESSION_LEVEL_META_KEY: &str = "lance-encoding:compression-level";
pub const BLOB_META_KEY: &str = "lance-encoding:blob";
pub const PACKED_STRUCT_LEGACY_META_KEY: &str = "packed";
pub const PACKED_STRUCT_META_KEY: &str = "lance-encoding:packed";
pub const STRUCTURAL_ENCODING_META_KEY: &str = "lance-encoding:structural-encoding";
pub const STRUCTURAL_ENCODING_MINIBLOCK: &str = "miniblock";
pub const STRUCTURAL_ENCODING_FULLZIP: &str = "fullzip";

lazy_static::lazy_static! {
pub static ref BLOB_DESC_FIELDS: Fields =
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-core/src/utils/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ impl LancePathExt for Path {
fn child_path(&self, path: &Path) -> Path {
let mut new_path = self.clone();
for part in path.parts() {
new_path = path.child(part);
new_path = new_path.child(part);
}
new_path
}
Expand Down
39 changes: 38 additions & 1 deletion rust/lance-datagen/src/generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,38 @@ impl BatchGeneratorBuilder {
}
}

/// Factory for creating a single random array
pub struct ArrayGeneratorBuilder {
generator: Box<dyn ArrayGenerator>,
seed: Option<Seed>,
}

impl ArrayGeneratorBuilder {
fn new(generator: Box<dyn ArrayGenerator>) -> Self {
Self {
generator,
seed: None,
}
}

/// Use the given seed for the generator
pub fn with_seed(mut self, seed: Seed) -> Self {
self.seed = Some(seed);
self
}

/// Generate a single array with the given length
pub fn into_array_rows(
mut self,
length: RowCount,
) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
);
self.generator.generate(length, &mut rng)
}
}

const MS_PER_DAY: i64 = 86400000;

pub mod array {
Expand Down Expand Up @@ -1858,11 +1890,16 @@ pub mod array {
}
}

/// Create a BatchGeneratorBuilder to start generating data
/// Create a BatchGeneratorBuilder to start generating batch data
pub fn gen() -> BatchGeneratorBuilder {
BatchGeneratorBuilder::default()
}

/// Create an ArrayGeneratorBuilder to start generating array data
pub fn gen_array(gen: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
ArrayGeneratorBuilder::new(gen)
}

/// Create a BatchGeneratorBuilder with the given schema
///
/// You can add more columns or convert this into a reader immediately
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ impl DataBlock {
Self::Empty() => Self::Empty(),
Self::Constant(inner) => Self::Constant(inner),
Self::AllNull(_) => panic!("Cannot remove validity on all-null data"),
Self::Nullable(inner) => *inner.data,
Self::Nullable(inner) => inner.data.remove_validity(),
Self::FixedWidth(inner) => Self::FixedWidth(inner),
Self::FixedSizeList(inner) => Self::FixedSizeList(inner.remove_validity()),
Self::VariableWidth(inner) => Self::VariableWidth(inner),
Expand Down
Loading
Loading