From 8f795e8abf706a24fe104500bf15efaa2bc07b15 Mon Sep 17 00:00:00 2001 From: Cameron Esfahani <cesfahani@roku.com> Date: Thu, 9 Nov 2023 10:53:31 +0100 Subject: [PATCH] feat: add basic connectivity check Implement a simple connectivity check in a new `gix-fsck` crate, and add this to `gix` via a new `fsck` subcommand. Currently this is functionally equivalent to: `git rev-list --objects --quiet --missing=print` feat: add basic connectivity check - address review feedback --- Cargo.lock | 12 ++ Cargo.toml | 1 + README.md | 1 + crate-status.md | 13 ++ gitoxide-core/Cargo.toml | 1 + gitoxide-core/src/repository/fsck.rs | 40 ++++++ gitoxide-core/src/repository/mod.rs | 1 + gix-fsck/Cargo.toml | 22 +++ gix-fsck/Changelog.md | 6 + gix-fsck/LICENSE-APACHE | 1 + gix-fsck/LICENSE-MIT | 1 + gix-fsck/src/lib.rs | 127 ++++++++++++++++++ gix-fsck/tests/connectivity/mod.rs | 84 ++++++++++++ .../fixtures/generated-archives/.gitignore | 1 + gix-fsck/tests/fixtures/make_test_repos.sh | 32 +++++ gix-fsck/tests/fsck.rs | 8 ++ src/plumbing/main.rs | 15 ++- src/plumbing/options/mod.rs | 14 ++ 18 files changed, 378 insertions(+), 2 deletions(-) create mode 100644 gitoxide-core/src/repository/fsck.rs create mode 100644 gix-fsck/Cargo.toml create mode 100644 gix-fsck/Changelog.md create mode 120000 gix-fsck/LICENSE-APACHE create mode 120000 gix-fsck/LICENSE-MIT create mode 100644 gix-fsck/src/lib.rs create mode 100644 gix-fsck/tests/connectivity/mod.rs create mode 100644 gix-fsck/tests/fixtures/generated-archives/.gitignore create mode 100755 gix-fsck/tests/fixtures/make_test_repos.sh create mode 100644 gix-fsck/tests/fsck.rs diff --git a/Cargo.lock b/Cargo.lock index 6433d14dea5..6f207b9dfc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1190,6 +1190,7 @@ dependencies = [ "futures-lite", "gix", "gix-archive", + "gix-fsck", "gix-pack", "gix-status", "gix-transport", @@ -1649,6 +1650,17 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-fsck" +version = "0.0.0" +dependencies = [ + "gix-hash 0.13.1", + "gix-hashtable 0.4.0", + "gix-object 0.38.0", + "gix-odb", + "gix-testtools", +] + [[package]] name = "gix-glob" version = "0.7.0" diff --git a/Cargo.toml b/Cargo.toml index 0234c89edcb..8bb402b772e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -281,6 +281,7 @@ members = [ "gix-archive", "gix-worktree-stream", "gix-revwalk", + "gix-fsck", "tests/tools", diff --git a/README.md b/README.md index 44952b4dcf7..3d8c283eb63 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ is usable to some extent. * [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui) * [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix) * [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle) + * [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck) ### Stress Testing * [x] Verify huge packs diff --git a/crate-status.md b/crate-status.md index 134a43c9b32..dfe112c77ef 100644 --- a/crate-status.md +++ b/crate-status.md @@ -775,6 +775,19 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [x] validate submodule names * [x] [validate][tagname-validation] tag names +### gix-fsck +* [x] validate connectivity when provided a specific commit as a starting point +* [ ] validate connectivity of all `refs` in the index +* [ ] progress reporting and interruptability +* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit) +* [ ] add support for various options + * [ ] write dangling objects to the `.git/log-found` directory structure + * [ ] `strict` mode, to check for tree objects with `g+w` permissions + * [ ] consider reflog entries as reference nodes/heads + * [ ] when reporting reachable objects, include _how_ they are reachable + * [ ] which reference node(s) made them reachable + * [ ] parent commit(s) + ### gix-ref * [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well. * **Stores** diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml index 3312c6724dc..aec7996081b 100644 --- a/gitoxide-core/Cargo.toml +++ b/gitoxide-core/Cargo.toml @@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false } gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] } gix-status = { version = "^0.2.0", path = "../gix-status" } +gix-fsck = { version = "^0.0.0", path = "../gix-fsck" } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } anyhow = "1.0.42" thiserror = "1.0.34" diff --git a/gitoxide-core/src/repository/fsck.rs b/gitoxide-core/src/repository/fsck.rs new file mode 100644 index 00000000000..1628e62adb7 --- /dev/null +++ b/gitoxide-core/src/repository/fsck.rs @@ -0,0 +1,40 @@ +use std::io::{BufWriter, Write}; + +use anyhow::Context; +use gix::{objs::Kind, ObjectId}; + +pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> { + let mut out = BufWriter::with_capacity(64 * 1024, out); + let spec = spec.unwrap_or("HEAD".into()); + + repo.object_cache_size_if_unset(4 * 1024 * 1024); + // We expect to be finding a bunch of non-existent objects here - never refresh the ODB + repo.objects.refresh_never(); + + let id = repo + .rev_parse_single(spec.as_str()) + .context("Only single revisions are supported")?; + let commits: gix::revision::Walk<'_> = id + .object()? + .peel_to_kind(gix::object::Kind::Commit) + .context("Need commitish as starting point")? + .id() + .ancestors() + .all()?; + + let missing_cb = |oid: &ObjectId, kind: Kind| { + writeln!(out, "{oid}: {kind}").expect("failed to write output"); + }; + let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb); + + // Walk all commits, checking each one for connectivity + for commit in commits { + let commit = commit?; + conn.check_commit(&commit.id); + for parent in commit.parent_ids { + conn.check_commit(&parent); + } + } + + Ok(()) +} diff --git a/gitoxide-core/src/repository/mod.rs b/gitoxide-core/src/repository/mod.rs index c78e82edb85..4b97259b71a 100644 --- a/gitoxide-core/src/repository/mod.rs +++ b/gitoxide-core/src/repository/mod.rs @@ -35,6 +35,7 @@ pub use clone::function::clone; pub use fetch::function::fetch; pub mod commitgraph; +pub mod fsck; pub mod index; pub mod mailmap; pub mod odb; diff --git a/gix-fsck/Cargo.toml b/gix-fsck/Cargo.toml new file mode 100644 index 00000000000..172c8cdb35f --- /dev/null +++ b/gix-fsck/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "gix-fsck" +version = "0.0.0" +repository = "https://github.com/Byron/gitoxide" +authors = ["Cameron Esfahani <cesfahani@gmail.com>", "Sebastian Thiel <sebastian.thiel@icloud.com>"] +license = "MIT OR Apache-2.0" +description = "Verifies the connectivity and validity of objects in the database" +edition = "2021" +include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"] +rust-version = "1.65" + +[lib] +doctest = false + +[dependencies] +gix-hash = { version = "^0.13.1", path = "../gix-hash" } +gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" } +gix-object = { version = "^0.38.0", path = "../gix-object" } + +[dev-dependencies] +gix-odb = { path = "../gix-odb" } +gix-testtools = { path = "../tests/tools"} diff --git a/gix-fsck/Changelog.md b/gix-fsck/Changelog.md new file mode 100644 index 00000000000..1d013ff92fa --- /dev/null +++ b/gix-fsck/Changelog.md @@ -0,0 +1,6 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). diff --git a/gix-fsck/LICENSE-APACHE b/gix-fsck/LICENSE-APACHE new file mode 120000 index 00000000000..965b606f331 --- /dev/null +++ b/gix-fsck/LICENSE-APACHE @@ -0,0 +1 @@ +../LICENSE-APACHE \ No newline at end of file diff --git a/gix-fsck/LICENSE-MIT b/gix-fsck/LICENSE-MIT new file mode 120000 index 00000000000..76219eb72e8 --- /dev/null +++ b/gix-fsck/LICENSE-MIT @@ -0,0 +1 @@ +../LICENSE-MIT \ No newline at end of file diff --git a/gix-fsck/src/lib.rs b/gix-fsck/src/lib.rs new file mode 100644 index 00000000000..6583697fc40 --- /dev/null +++ b/gix-fsck/src/lib.rs @@ -0,0 +1,127 @@ +//! A library for performing object database integrity and connectivity checks +#![deny(rust_2018_idioms)] + +use gix_hash::ObjectId; +use gix_hashtable::HashSet; +use gix_object::{tree::EntryMode, Exists, FindExt, Kind}; + +pub struct ConnectivityCheck<'a, T, F> +where + T: FindExt + Exists, + F: FnMut(&ObjectId, Kind), +{ + /// ODB handle to use for the check + db: &'a T, + /// Closure to invoke when a missing object is encountered + missing_cb: F, + /// Set of Object IDs already (or about to be) scanned during the check + oid_set: HashSet, + /// Single buffer for decoding objects from the ODB + /// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements) + buf: Vec<u8>, +} + +impl<'a, T, F> ConnectivityCheck<'a, T, F> +where + T: FindExt + Exists, + F: FnMut(&ObjectId, Kind), +{ + /// Instantiate a connectivity check + pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> { + ConnectivityCheck { + db, + missing_cb, + oid_set: HashSet::default(), + buf: Vec::new(), + } + } + + /// Run the connectivity check on the provided commit object ID + /// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB + /// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped + /// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb` + /// - Missing commits or trees will currently result in panic + /// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?) + pub fn check_commit(&mut self, oid: &ObjectId) { + // Attempt to insert the commit ID in the set, and if already present, return immediately + if !self.oid_set.insert(*oid) { + return; + } + // Obtain the commit's tree ID + let tree_id = { + let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit"); + commit.tree() + }; + + // Attempt to insert the tree ID in the set, and if already present, return immediately + if self.oid_set.insert(tree_id) { + self.check_tree(&tree_id); + } + } + + fn check_tree(&mut self, oid: &ObjectId) { + let tree = match self.db.find_tree(oid, &mut self.buf) { + Ok(tree) => tree, + Err(_) => { + // Tree is missing, so invoke `missing_cb` + (self.missing_cb)(oid, Kind::Tree); + return; + } + }; + + // Keeping separate sets for trees and blobs for now... + // This is about a wash when compared to using a HashMap<ObjectID, Kind> + struct TreeEntries { + trees: HashSet<ObjectId>, + blobs: HashSet<ObjectId>, + } + + // Build up a set of trees and a set of blobs + let entries: TreeEntries = { + let mut entries = TreeEntries { + trees: HashSet::default(), + blobs: HashSet::default(), + }; + + // For each entry in the tree + for entry_ref in tree.entries.iter() { + match entry_ref.mode { + EntryMode::Tree => { + let tree_id = entry_ref.oid.to_owned(); + // Check if the tree has already been encountered + if self.oid_set.insert(tree_id) { + entries.trees.insert(tree_id); + } + } + EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => { + let blob_id = entry_ref.oid.to_owned(); + // Check if the blob has already been encountered + if self.oid_set.insert(blob_id) { + entries.blobs.insert(blob_id); + } + } + EntryMode::Commit => { + // This implies a submodule (OID is the commit hash of the submodule) + // Skip it as it's not in this repository! + } + } + } + entries + }; + + for tree_id in entries.trees.iter() { + self.check_tree(tree_id); + } + for blob_id in entries.blobs.iter() { + self.check_blob(blob_id); + } + } + + fn check_blob(&mut self, oid: &ObjectId) { + // Check if the blob is missing from the ODB + if !self.db.exists(oid) { + // Blob is missing, so invoke `missing_cb` + (self.missing_cb)(oid, Kind::Blob); + } + } +} diff --git a/gix-fsck/tests/connectivity/mod.rs b/gix-fsck/tests/connectivity/mod.rs new file mode 100644 index 00000000000..4625fbf681e --- /dev/null +++ b/gix-fsck/tests/connectivity/mod.rs @@ -0,0 +1,84 @@ +use std::ops::Deref; + +use gix_fsck::ConnectivityCheck; +use gix_hash::ObjectId; +use gix_hashtable::HashMap; +use gix_object::Kind; +use gix_testtools::once_cell::sync::Lazy; + +use crate::hex_to_id; + +fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> { + let db = { + let fixture_path = gix_testtools::scripted_fixture_read_only("make_test_repos.sh") + .expect("fixture path") + .join(repo_name) + .join(".git") + .join("objects"); + let mut db = gix_odb::at(fixture_path).expect("odb handle"); + db.refresh_never(); + db + }; + + let mut missing: HashMap<ObjectId, Kind> = HashMap::default(); + let callback = |oid: &ObjectId, kind: Kind| { + missing.try_insert(*oid, kind).expect("no duplicate oid"); + }; + + let mut check = ConnectivityCheck::new(&db, callback); + for commit in commits.into_iter() { + check.check_commit(commit); + } + + missing +} + +fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> { + hex_ids.into_iter().map(hex_to_id).collect() +} + +fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> { + hex_to_ids(hex_ids) + .into_iter() + .map(|id| (id, kind)) + .collect::<HashMap<_, _>>() +} + +// Get a `&Vec<ObjectID` for each commit in the test fixture repository +fn all_commits<'a>() -> &'a Vec<ObjectId> { + static ALL_COMMITS: Lazy<Vec<ObjectId>> = Lazy::new(|| { + hex_to_ids(vec![ + "5d18db2e2aabadf7b914435ef34f2faf8b4546dd", + "3a3dfaa55a515f3fb3a25751107bbb523af6a1b0", + "734c926856a328d1168ffd7088532e0d1ad19bbe", + ]) + }); + ALL_COMMITS.deref() +} + +#[test] +fn no_missing() { + // The "base" repo is the original, and has every object present + assert_eq!(check_missing("base", all_commits()), HashMap::default()); +} + +#[test] +fn missing_blobs() { + // The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob + let expected = hex_to_objects(vec!["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob); + assert_eq!(check_missing("blobless", all_commits()), expected); +} + +#[test] +fn missing_trees() { + // The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees + // NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing + let expected = hex_to_objects( + vec![ + "9561cfbae43c5e2accdfcd423378588dd10d827f", + "fc264b3b6875a46e9031483aeb9994a1b897ffd3", + ], + Kind::Tree, + ); + assert_eq!(check_missing("treeless", all_commits()), expected); +} diff --git a/gix-fsck/tests/fixtures/generated-archives/.gitignore b/gix-fsck/tests/fixtures/generated-archives/.gitignore new file mode 100644 index 00000000000..d47a40f0345 --- /dev/null +++ b/gix-fsck/tests/fixtures/generated-archives/.gitignore @@ -0,0 +1 @@ +make_test_repos.tar.xz diff --git a/gix-fsck/tests/fixtures/make_test_repos.sh b/gix-fsck/tests/fixtures/make_test_repos.sh new file mode 100755 index 00000000000..19491a0c551 --- /dev/null +++ b/gix-fsck/tests/fixtures/make_test_repos.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -x +set -euo pipefail + +# We override the global config with our own local one (see below) +export GIT_CONFIG_GLOBAL="$PWD/.gitconfig" + +# We need to be able to do partial clones, so enable it +# - needs to be present in the persistent gitconfig, as a clone with `--no-local` +git config --global uploadpack.allowFilter true + +# First build out a base repository +git init base +( + cd base + + echo "blob 1" > blob-1 + git add -A + git commit -m "commit 1" + echo "blob-2" > blob-2 + git add -A + git commit -m "commit 2" + git rm blob-1 + git add -A + git commit -m "commit 3" +) + +# Blobless clone +git clone --no-local --no-hardlinks --filter=blob:none ./base blobless + +# Treeless (and blobless) clone +git clone --no-local --no-hardlinks --filter=tree:0 ./base treeless diff --git a/gix-fsck/tests/fsck.rs b/gix-fsck/tests/fsck.rs new file mode 100644 index 00000000000..f12ebfebc25 --- /dev/null +++ b/gix-fsck/tests/fsck.rs @@ -0,0 +1,8 @@ +use gix_hash::ObjectId; +pub use gix_testtools::Result; + +pub fn hex_to_id(hex: &str) -> ObjectId { + ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") +} + +mod connectivity; diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs index f36d0494b26..670ab1b391c 100644 --- a/src/plumbing/main.rs +++ b/src/plumbing/main.rs @@ -15,8 +15,8 @@ use gix::bstr::{io::BufReadExt, BString}; use crate::plumbing::{ options::{ - attributes, commit, commitgraph, config, credential, exclude, free, index, mailmap, odb, revision, tree, Args, - Subcommands, + attributes, commit, commitgraph, config, credential, exclude, free, fsck, index, mailmap, odb, revision, tree, + Args, Subcommands, }, show_progress, }; @@ -1072,6 +1072,17 @@ pub fn main() -> Result<()> { move |_progress, out, err| core::repository::odb::info(repository(Mode::Strict)?, format, out, err), ), }, + Subcommands::Fsck(cmd) => match cmd { + fsck::Subcommands::Connectivity { spec } => prepare_and_run( + "fsck-connectivity", + trace, + verbose, + progress, + progress_keep_open, + None, + move |_progress, out, _err| core::repository::fsck::connectivity(repository(Mode::Strict)?, spec, out), + ), + }, Subcommands::Mailmap(cmd) => match cmd { mailmap::Subcommands::Entries => prepare_and_run( "mailmap-entries", diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index f7b484aaf6f..1eb84ea8400 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -87,6 +87,9 @@ pub enum Subcommands { /// Interact with the object database. #[clap(subcommand)] Odb(odb::Subcommands), + /// Perform integrity checks on the repository. + #[clap(subcommand)] + Fsck(fsck::Subcommands), /// Interact with tree objects. #[clap(subcommand)] Tree(tree::Subcommands), @@ -489,6 +492,17 @@ pub mod odb { } } +pub mod fsck { + #[derive(Debug, clap::Subcommand)] + pub enum Subcommands { + /// Perform a connectivity check on the repository. + Connectivity { + /// A revspec to start the connectivity check from. + spec: Option<String>, + }, + } +} + pub mod tree { #[derive(Debug, clap::Subcommand)] pub enum Subcommands {