From 4aea9b097fb08e504cdfc4a7c3b7511a308dc074 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 28 Nov 2023 16:15:58 +0100 Subject: [PATCH] feat: add the`diff::resource_cache()` low-level utility for rapid in-memory diffing of combinations of resources. We also add the `object::tree::diff::Platform::for_each_to_obtain_tree_with_cache()` to pass a resource-cache for re-use between multiple invocation for significant savings. --- gix/src/config/cache/access.rs | 98 +++++++++++++++++-- gix/src/config/mod.rs | 30 ++++++ gix/src/config/tree/sections/core.rs | 4 + gix/src/config/tree/sections/diff.rs | 68 ++++++++++++- gix/src/diff.rs | 64 +++++++++++- gix/src/object/tree/diff/for_each.rs | 69 +++++++++++-- gix/tests/config/tree.rs | 31 ++++++ gix/tests/diff/mod.rs | 46 +++++++++ .../generated-archives/make_diff_repo.tar.xz | 4 +- gix/tests/fixtures/make_diff_repo.sh | 18 ++++ gix/tests/gix.rs | 2 + src/plumbing/progress.rs | 4 - 12 files changed, 417 insertions(+), 21 deletions(-) create mode 100644 gix/tests/diff/mod.rs diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index 5f9f71887fa..feeb241ba30 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -39,6 +39,97 @@ impl Cache { .copied() } + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_drivers(&self) -> Result, config::diff::drivers::Error> { + use crate::config::cache::util::ApplyLeniencyDefault; + let mut out = Vec::::new(); + for section in self + .resolved + .sections_by_name("diff") + .into_iter() + .flatten() + .filter(|s| (self.filter_config_section)(s.meta())) + { + let Some(name) = section.header().subsection_name().filter(|n| !n.is_empty()) else { + continue; + }; + + let driver = match out.iter_mut().find(|d| d.name == name) { + Some(existing) => existing, + None => { + out.push(gix_diff::blob::Driver { + name: name.into(), + ..Default::default() + }); + out.last_mut().expect("just pushed") + } + }; + + if let Some(binary) = section.value_implicit("binary") { + driver.is_binary = config::tree::Diff::DRIVER_BINARY + .try_into_binary(binary) + .with_leniency(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "binary", + source: Box::new(err), + })?; + } + if let Some(command) = section.value(config::tree::Diff::DRIVER_COMMAND.name) { + driver.command = command.into_owned().into(); + } + if let Some(textconv) = section.value(config::tree::Diff::DRIVER_TEXTCONV.name) { + driver.binary_to_text_command = textconv.into_owned().into(); + } + if let Some(algorithm) = section.value("algorithm") { + driver.algorithm = config::tree::Diff::DRIVER_ALGORITHM + .try_into_algorithm(algorithm) + .or_else(|err| match err { + config::diff::algorithm::Error::Unimplemented { .. } if self.lenient_config => { + Ok(gix_diff::blob::Algorithm::Histogram) + } + err => Err(err), + }) + .with_lenient_default(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "algorithm", + source: Box::new(err), + })? + .into(); + } + } + Ok(out) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_pipeline_options( + &self, + ) -> Result { + Ok(gix_diff::blob::pipeline::Options { + large_file_threshold_bytes: self.big_file_threshold()?, + fs: self.fs_capabilities()?, + }) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { + self.diff_renames + .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) + .copied() + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn big_file_threshold(&self) -> Result { + Ok(self + .resolved + .integer_by_key("core.bigFileThreshold") + .map(|number| Core::BIG_FILE_THRESHOLD.try_into_u64(number)) + .transpose() + .with_leniency(self.lenient_config)? + .unwrap_or(512 * 1024 * 1024)) + } + /// Returns a user agent for use with servers. #[cfg(any(feature = "async-network-client", feature = "blocking-network-client"))] pub(crate) fn user_agent_tuple(&self) -> (&'static str, Option>) { @@ -92,13 +183,6 @@ impl Cache { }) } - #[cfg(feature = "blob-diff")] - pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { - self.diff_renames - .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) - .copied() - } - /// Returns (file-timeout, pack-refs timeout) pub(crate) fn lock_timeout( &self, diff --git a/gix/src/config/mod.rs b/gix/src/config/mod.rs index 33c97fac687..2db9e1e6bd3 100644 --- a/gix/src/config/mod.rs +++ b/gix/src/config/mod.rs @@ -122,6 +122,36 @@ pub mod diff { Unimplemented { name: BString }, } } + + /// + pub mod pipeline_options { + /// The error produced when obtaining options needed to fill in [gix_diff::blob::pipeline::Options]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + FilesystemCapabilities(#[from] crate::config::boolean::Error), + #[error(transparent)] + BigFileThreshold(#[from] crate::config::unsigned_integer::Error), + } + } + + /// + pub mod drivers { + use crate::bstr::BString; + + /// The error produced when obtaining a list of [Drivers](gix_diff::blob::Driver). + #[derive(Debug, thiserror::Error)] + #[error("Failed to parse value of 'diff.{name}.{attribute}'")] + pub struct Error { + /// The name fo the driver. + pub name: BString, + /// The name of the attribute we tried to parse. + pub attribute: &'static str, + /// The actual error that occurred. + pub source: Box, + } + } } /// diff --git a/gix/src/config/tree/sections/core.rs b/gix/src/config/tree/sections/core.rs index 2ec5c279ea3..50df2a77d9c 100644 --- a/gix/src/config/tree/sections/core.rs +++ b/gix/src/config/tree/sections/core.rs @@ -8,6 +8,9 @@ impl Core { pub const ABBREV: Abbrev = Abbrev::new_with_validate("abbrev", &config::Tree::CORE, validate::Abbrev); /// The `core.bare` key. pub const BARE: keys::Boolean = keys::Boolean::new_boolean("bare", &config::Tree::CORE); + /// The `core.bigFileThreshold` key. + pub const BIG_FILE_THRESHOLD: keys::UnsignedInteger = + keys::UnsignedInteger::new_unsigned_integer("bigFileThreshold", &config::Tree::CORE); /// The `core.checkStat` key. pub const CHECK_STAT: CheckStat = CheckStat::new_with_validate("checkStat", &config::Tree::CORE, validate::CheckStat); @@ -95,6 +98,7 @@ impl Section for Core { &[ &Self::ABBREV, &Self::BARE, + &Self::BIG_FILE_THRESHOLD, &Self::CHECK_STAT, &Self::DELTA_BASE_CACHE_LIMIT, &Self::DISAMBIGUATE, diff --git a/gix/src/config/tree/sections/diff.rs b/gix/src/config/tree/sections/diff.rs index 7c467b8f523..df84d9f4c26 100644 --- a/gix/src/config/tree/sections/diff.rs +++ b/gix/src/config/tree/sections/diff.rs @@ -1,3 +1,4 @@ +use crate::config::tree::SubSectionRequirement; use crate::{ config, config::tree::{keys, Diff, Key, Section}, @@ -17,6 +18,20 @@ impl Diff { ); /// The `diff.renames` key. pub const RENAMES: Renames = Renames::new_renames("renames", &config::Tree::DIFF); + + /// The `diff..command` key. + pub const DRIVER_COMMAND: keys::String = keys::String::new_string("command", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..textconv` key. + pub const DRIVER_TEXTCONV: keys::String = keys::String::new_string("textconv", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..algorithm` key. + pub const DRIVER_ALGORITHM: Algorithm = + Algorithm::new_with_validate("algorithm", &config::Tree::DIFF, validate::Algorithm) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..binary` key. + pub const DRIVER_BINARY: Binary = Binary::new_with_validate("binary", &config::Tree::DIFF, validate::Binary) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); } impl Section for Diff { @@ -25,7 +40,15 @@ impl Section for Diff { } fn keys(&self) -> &[&dyn Key] { - &[&Self::ALGORITHM, &Self::RENAME_LIMIT, &Self::RENAMES] + &[ + &Self::ALGORITHM, + &Self::RENAME_LIMIT, + &Self::RENAMES, + &Self::DRIVER_COMMAND, + &Self::DRIVER_TEXTCONV, + &Self::DRIVER_ALGORITHM, + &Self::DRIVER_BINARY, + ] } } @@ -35,6 +58,9 @@ pub type Algorithm = keys::Any; /// The `diff.renames` key. pub type Renames = keys::Any; +/// The `diff..binary` key. +pub type Binary = keys::Any; + mod algorithm { use std::borrow::Cow; @@ -67,6 +93,38 @@ mod algorithm { } } +mod binary { + use crate::config::tree::diff::Binary; + + impl Binary { + /// Convert `value` into a tri-state boolean that can take the special value `auto`, resulting in `None`, or is a boolean. + /// If `None` is given, it's treated as implicit boolean `true`, as this method is made to be used + /// with [`gix_config::file::section::Body::value_implicit()`]. + pub fn try_into_binary( + &'static self, + value: Option>, + ) -> Result, crate::config::key::GenericErrorWithValue> { + Ok(match value { + None => Some(true), + Some(value) => { + if value.as_ref() == "auto" { + None + } else { + Some( + gix_config::Boolean::try_from(value.as_ref()) + .map(|b| b.0) + .map_err(|err| { + crate::config::key::GenericErrorWithValue::from_value(self, value.into_owned()) + .with_source(err) + })?, + ) + } + } + }) + } + } +} + mod renames { use crate::{ bstr::ByteSlice, @@ -125,4 +183,12 @@ mod validate { Ok(()) } } + + pub struct Binary; + impl keys::Validate for Binary { + fn validate(&self, value: &BStr) -> Result<(), Box> { + Diff::DRIVER_BINARY.try_into_binary(Some(value.into()))?; + Ok(()) + } + } } diff --git a/gix/src/diff.rs b/gix/src/diff.rs index 445698cea39..89fd7315e80 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -22,6 +22,7 @@ mod utils { use crate::config::cache::util::ApplyLeniency; use crate::config::tree::Diff; use crate::diff::rename::Tracking; + use crate::Repository; use gix_diff::rewrites::Copies; use gix_diff::Rewrites; @@ -38,6 +39,27 @@ mod utils { } } + /// + pub mod resource_cache { + /// The error returned by [`resource_cache()`](super::resource_cache()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + DiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + WorktreeFilterOptions(#[from] crate::filter::pipeline::options::Error), + #[error(transparent)] + DiffDrivers(#[from] crate::config::diff::drivers::Error), + #[error(transparent)] + DiffPipelineOptions(#[from] crate::config::diff::pipeline_options::Error), + #[error(transparent)] + CommandContext(#[from] crate::config::command_context::Error), + #[error(transparent)] + AttributeStack(#[from] crate::config::attribute_stack::Error), + } + } + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. /// Returns `Ok(None)` if nothing is configured. /// @@ -75,6 +97,46 @@ mod utils { } .into()) } + + /// Return a low-level utility to efficiently prepare a the blob-level diff operation between two resources, + /// and cache these diffable versions so that matrix-like MxN diffs are efficient. + /// + /// `repo` is used to obtain the needed configuration values, and `index` is used to potentially read `.gitattributes` + /// files from which may affect the diff operation. + /// `mode` determines how the diffable files will look like, and also how fast, in average, these conversions are. + /// `roots` provide information about where to get diffable data from, so source and destination can either be sourced from + /// a worktree, or from the object database, or both. + pub fn resource_cache( + repo: &Repository, + index: &gix_index::State, + mode: gix_diff::blob::pipeline::Mode, + roots: gix_diff::blob::pipeline::WorktreeRoots, + ) -> Result { + let diff_algo = repo.config.diff_algorithm()?; + let diff_cache = gix_diff::blob::Platform::new( + gix_diff::blob::platform::Options { + algorithm: Some(diff_algo), + skip_internal_diff_if_external_is_configured: false, + }, + gix_diff::blob::Pipeline::new( + roots, + gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(repo)?), + repo.config.diff_drivers()?, + repo.config.diff_pipeline_options()?, + ), + mode, + repo.attributes_only( + // TODO(perf): this could benefit from not having to build an intermediate index, + // and traverse the a tree directly. + index, + // This is an optimization, as we avoid reading files from the working tree, which also + // might not match the index at all depending on what the user passed. + gix_worktree::stack::state::attributes::Source::IdMapping, + )? + .inner, + ); + Ok(diff_cache) + } } #[cfg(feature = "blob-diff")] -pub use utils::new_rewrites; +pub use utils::{new_rewrites, resource_cache}; diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index 404e804327d..85d809fa83b 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -12,10 +12,12 @@ pub enum Error { Diff(#[from] gix_diff::tree::changes::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box), - #[error("Could not configure diff algorithm prior to checking similarity")] - ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + ResourceCache(#[from] crate::diff::resource_cache::Error), #[error("Failure during rename tracking")] RenameTracking(#[from] tracker::emit::Error), + #[error("Index for use in attribute stack could not be loaded")] + OpenIndex(#[from] crate::repository::index_or_load_from_head::Error), } /// @@ -36,18 +38,51 @@ impl<'a, 'old> Platform<'a, 'old> { other: &Tree<'new>, for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, None) + } + + /// Like [`Self::for_each_to_obtain_tree()`], but with a reusable `resource_cache` which is used to perform + /// diffs fast. + /// + /// Reusing it between multiple invocations saves a lot of IOps as it avoids the creation + /// of a temporary `resource_cache` that triggers reading or checking for multiple gitattribute files. + /// Note that it's recommended to call [`gix_diff::blob::Platform::clear_resource_cache()`] between the calls + /// to avoid runaway memory usage, as the cache isn't limited. + /// + /// Note that to do rename tracking like `git` does, one has to configure the `resource_cache` with + /// a conversion pipeline that uses [`gix_diff::blob::pipeline::Mode::ToGit`]. + pub fn for_each_to_obtain_tree_with_cache<'new, E>( + &mut self, + other: &Tree<'new>, + resource_cache: &mut gix_diff::blob::Platform, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, Some(resource_cache)) + } + + fn for_each_to_obtain_tree_inner<'new, E>( + &mut self, + other: &Tree<'new>, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + resource_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result where E: std::error::Error + Sync + Send + 'static, { let repo = self.lhs.repo; - let diff_algo = repo.config.diff_algorithm()?; let mut delegate = Delegate { src_tree: self.lhs, other_repo: other.repo, recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), visit: for_each, location: self.tracking, - tracked: self.rewrites.map(|r| rewrites::Tracker::new(r, diff_algo)), + tracked: self.rewrites.map(rewrites::Tracker::new), err: None, }; match gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( @@ -58,7 +93,7 @@ impl<'a, 'old> Platform<'a, 'old> { ) { Ok(()) => { let outcome = Outcome { - rewrites: delegate.process_tracked_changes()?, + rewrites: delegate.process_tracked_changes(resource_cache)?, }; match delegate.err { Some(err) => Err(Error::ForEach(Box::new(err))), @@ -131,12 +166,33 @@ where } } - fn process_tracked_changes(&mut self) -> Result, Error> { + fn process_tracked_changes( + &mut self, + diff_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result, Error> { let tracked = match self.tracked.as_mut() { Some(t) => t, None => return Ok(None), }; + let repo = self.src_tree.repo; + let mut storage; + let diff_cache = match diff_cache { + Some(cache) => cache, + None => { + storage = crate::diff::resource_cache( + repo, + // NOTE: we could easily load the index at the source or destination tree, + // but even that isn't perfectly correct as there is only one, used for both sides. + // This is how `git` does it (or at least so it seems). + &*repo.index_or_load_from_head()?, + gix_diff::blob::pipeline::Mode::ToGit, + Default::default(), + )?; + &mut storage + } + }; + let outcome = tracked.emit( |dest, source| match source { Some(source) => { @@ -174,6 +230,7 @@ where &mut self.err, ), }, + diff_cache, &self.src_tree.repo.objects, |push| { self.src_tree diff --git a/gix/tests/config/tree.rs b/gix/tests/config/tree.rs index fd844f8b0f0..9ddb8d0470e 100644 --- a/gix/tests/config/tree.rs +++ b/gix/tests/config/tree.rs @@ -243,6 +243,37 @@ mod diff { Ok(()) } + #[test] + fn driver_binary() -> crate::Result { + assert_eq!( + Diff::DRIVER_BINARY.try_into_binary(Some(bcow("auto")))?, + None, + "this is as good as not setting it, but it's a valid value that would fail if it was just a boolean. It's undocumented though…" + ); + assert!(Diff::DRIVER_BINARY.validate("auto".into()).is_ok()); + + for (actual, expected) in [ + (Some("true"), Some(true)), + (Some("false"), Some(false)), + (None, Some(true)), + ] { + assert_eq!(Diff::DRIVER_BINARY.try_into_binary(actual.map(bcow))?, expected); + if let Some(value) = actual { + assert!(Diff::DRIVER_BINARY.validate(value.into()).is_ok()); + } + } + + assert_eq!( + Diff::DRIVER_BINARY + .try_into_binary(Some(bcow("something"))) + .unwrap_err() + .to_string(), + "The key \"diff..binary=something\" was invalid", + ); + assert!(Diff::DRIVER_BINARY.validate("foo".into()).is_err()); + Ok(()) + } + #[test] fn algorithm() -> crate::Result { for (actual, expected) in [ diff --git a/gix/tests/diff/mod.rs b/gix/tests/diff/mod.rs new file mode 100644 index 00000000000..c89681e7012 --- /dev/null +++ b/gix/tests/diff/mod.rs @@ -0,0 +1,46 @@ +use crate::util::named_repo; +use gix_diff::blob::{Algorithm, Driver}; + +#[test] +fn resource_cache() -> crate::Result { + let repo = named_repo("make_diff_repo.sh")?; + let cache = gix::diff::resource_cache( + &repo, + &*repo.index()?, + gix::diff::blob::pipeline::Mode::ToWorktreeAndBinaryToText, + Default::default(), + )?; + assert_eq!( + cache.filter.drivers(), + &[ + Driver { + name: "all-but-binary".into(), + command: Some("command".into()), + algorithm: Some(Algorithm::Histogram), + binary_to_text_command: Some("textconv".into()), + is_binary: None + }, + Driver { + name: "binary-false".into(), + is_binary: Some(false), + ..Default::default() + }, + Driver { + name: "binary-true".into(), + is_binary: Some(true), + ..Default::default() + } + ] + ); + assert_eq!(cache.options.algorithm, Some(Algorithm::Histogram)); + assert!( + !cache.options.skip_internal_diff_if_external_is_configured, + "pre-set to something that makes sense for most" + ); + assert_eq!( + cache.filter.options.large_file_threshold_bytes, + 512 * 1024 * 1024, + "the default value unless it's configured" + ); + Ok(()) +} diff --git a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz index 0d6b76452b7..d4b13341e89 100644 --- a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz +++ b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dceeedc3ee706632030f8317f2245c327456ec58a6ac690d09099557a5aa23b0 -size 32336 +oid sha256:66f54895c75e2eebaad514224242522c19e2b16b12a48f22a059b438745e0327 +size 32444 diff --git a/gix/tests/fixtures/make_diff_repo.sh b/gix/tests/fixtures/make_diff_repo.sh index 1aac40b43a7..1175cf4ccd9 100755 --- a/gix/tests/fixtures/make_diff_repo.sh +++ b/gix/tests/fixtures/make_diff_repo.sh @@ -3,6 +3,24 @@ set -eu -o pipefail git init -q +cat <>.git/config + +[diff "binary-true"] + binary = true +[diff "binary-false"] + binary = false +[diff ""] + command = "empty is ignored" +[diff] + command = "this is also ignored as sub-section name is missing" + algorithm = histogram +[diff "all-but-binary"] + command = command + textconv = textconv + algorithm = histogram + binary = auto +EOF + git checkout -b main mkdir dir touch a b dir/c d diff --git a/gix/tests/gix.rs b/gix/tests/gix.rs index 4721e0d4eb2..81008faca89 100644 --- a/gix/tests/gix.rs +++ b/gix/tests/gix.rs @@ -4,6 +4,8 @@ use util::*; mod clone; mod commit; mod config; +#[cfg(feature = "blob-diff")] +mod diff; mod head; mod id; mod init; diff --git a/src/plumbing/progress.rs b/src/plumbing/progress.rs index 21b56fa71d8..21b0ab9af9d 100644 --- a/src/plumbing/progress.rs +++ b/src/plumbing/progress.rs @@ -124,10 +124,6 @@ static GIT_CONFIG: &[Record] = &[ config: "core.alternateRefsPrefixes", usage: NotPlanned { reason: "seems like a niche feature, but can be implemented if there is demand" } }, - Record { - config: "core.bigFileThreshold", - usage: Planned { note: Some("unfortunately we can't stream packed files yet, even if not delta-compressed, but respecting the threshold for other operations is definitely a must") } - }, Record { config: "core.compression", usage: Planned { note: Some("Allow to remove similar hardcoded value - passing it through will be some effort") },