From 8196a433ed08de6b09b5cb187f8ce53fc2ab09ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 25 Dec 2024 17:06:43 +0100 Subject: [PATCH] For linear histories, avoid redoing path lookup work Also, set a fixed and higher pack-cache to double typical pack-decode performance in mid-sized repositories. Additionally, normalize the input path. --- gitoxide-core/src/repository/blame.rs | 33 +++++++++++++++++---- gix-blame/src/file/function.rs | 41 +++++++++++++++++---------- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/gitoxide-core/src/repository/blame.rs b/gitoxide-core/src/repository/blame.rs index c130770e9dc..fea525035fa 100644 --- a/gitoxide-core/src/repository/blame.rs +++ b/gitoxide-core/src/repository/blame.rs @@ -1,4 +1,5 @@ -use gix::bstr::BStr; +use gix::bstr::ByteSlice; +use gix::config::tree; use std::ffi::OsStr; pub fn blame_file( @@ -7,7 +8,31 @@ pub fn blame_file( out: impl std::io::Write, err: Option<&mut dyn std::io::Write>, ) -> anyhow::Result<()> { - repo.object_cache_size_if_unset(repo.compute_object_cache_size_for_tree_diffs(&**repo.index_or_empty()?)); + { + let mut config = repo.config_snapshot_mut(); + if config.string(&tree::Core::DELTA_BASE_CACHE_LIMIT).is_none() { + config.set_value(&tree::Core::DELTA_BASE_CACHE_LIMIT, "100m")?; + } + } + let index = repo.index_or_empty()?; + repo.object_cache_size_if_unset(repo.compute_object_cache_size_for_tree_diffs(&index)); + + let file = gix::path::os_str_into_bstr(file)?; + let specs = repo.pathspec( + false, + [file], + true, + &index, + gix::worktree::stack::state::attributes::Source::WorktreeThenIdMapping.adjust_for_bare(repo.is_bare()), + )?; + // TODO: there should be a way to normalize paths without going through patterns, at least in this case maybe? + // `Search` actually sorts patterns by excluding or not, all that can lead to strange results. + let file = specs + .search() + .patterns() + .map(|p| p.path().to_owned()) + .next() + .expect("exactly one pattern"); let suspect = repo.head()?.peel_to_commit_in_place()?; let traverse = @@ -15,9 +40,7 @@ pub fn blame_file( .with_commit_graph(repo.commit_graph_if_enabled()?) .build()?; let mut resource_cache = repo.diff_resource_cache_for_tree_diff()?; - let file_path: &BStr = gix::path::os_str_into_bstr(file)?; - - let outcome = gix::blame::file(&repo.objects, traverse, &mut resource_cache, file_path)?; + let outcome = gix::blame::file(&repo.objects, traverse, &mut resource_cache, file.as_bstr())?; let statistics = outcome.statistics; write_blame_entries(out, outcome)?; diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index 59a63786687..16384638e36 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -69,12 +69,12 @@ where let mut stats = Statistics::default(); let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); - let blamed_file_entry = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)? + let blamed_file_entry_id = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)? .ok_or_else(|| Error::FileMissing { - file_path: file_path.to_owned(), - commit_id: suspect, - })?; - let blamed_file_blob = odb.find_blob(&blamed_file_entry.oid, &mut buf)?.data.to_vec(); + file_path: file_path.to_owned(), + commit_id: suspect, + })?; + let blamed_file_blob = odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec(); let num_lines_in_blamed = { let mut interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100); tokens_for_diffing(&blamed_file_blob) @@ -98,6 +98,7 @@ where let mut out = Vec::new(); let mut diff_state = gix_diff::tree::State::default(); + let mut previous_entry: Option<(ObjectId, ObjectId)> = None; 'outer: while let Some(item) = traverse.next() { if hunks_to_blame.is_empty() { break; @@ -123,15 +124,27 @@ where continue; } - let Some(entry) = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)? else { + let mut entry = previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| entry); + if entry.is_none() { + entry = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)?; + } + + let Some(entry_id) = entry else { continue; }; - for parent_id in &parent_ids { - if let Some(parent_entry) = + for (pid, parent_id) in parent_ids.iter().enumerate() { + if let Some(parent_entry_id) = find_path_entry_in_commit(&odb, parent_id, file_path, &mut buf, &mut buf2, &mut stats)? { - if entry.oid == parent_entry.oid { + let no_change_in_entry = entry_id == parent_entry_id; + if pid == 0 { + previous_entry = Some((*parent_id, parent_entry_id)); + } + if no_change_in_entry { pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame); continue 'outer; } @@ -170,10 +183,8 @@ where // Do nothing under the assumption that this always (or almost always) // implies that the file comes from a different parent, compared to which // it was modified, not added. - } else { - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } + } else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { + break 'outer; } } gix_diff::tree::recorder::Change::Deletion { .. } => { @@ -418,7 +429,7 @@ fn find_path_entry_in_commit( buf: &mut Vec, buf2: &mut Vec, stats: &mut Statistics, -) -> Result, Error> { +) -> Result, Error> { let commit_id = odb.find_commit(commit, buf)?.tree(); stats.commits_to_tree += 1; let tree_iter = odb.find_tree_iter(&commit_id, buf)?; @@ -430,7 +441,7 @@ fn find_path_entry_in_commit( file_path.split(|b| *b == b'/').inspect(|_| stats.trees_decoded += 1), )?; stats.trees_decoded -= 1; - Ok(res) + Ok(res.map(|e| e.oid)) } /// Return an iterator over tokens for use in diffing. These usually lines, but iit's important to unify them