From 8f795e8abf706a24fe104500bf15efaa2bc07b15 Mon Sep 17 00:00:00 2001
From: Cameron Esfahani <cesfahani@roku.com>
Date: Thu, 9 Nov 2023 10:53:31 +0100
Subject: [PATCH] feat: add basic connectivity check

Implement a simple connectivity check in a new `gix-fsck` crate, and add
this to `gix` via a new `fsck` subcommand. Currently this is
functionally equivalent to:
`git rev-list --objects --quiet --missing=print`

feat: add basic connectivity check - address review feedback
---
 Cargo.lock                                    |  12 ++
 Cargo.toml                                    |   1 +
 README.md                                     |   1 +
 crate-status.md                               |  13 ++
 gitoxide-core/Cargo.toml                      |   1 +
 gitoxide-core/src/repository/fsck.rs          |  40 ++++++
 gitoxide-core/src/repository/mod.rs           |   1 +
 gix-fsck/Cargo.toml                           |  22 +++
 gix-fsck/Changelog.md                         |   6 +
 gix-fsck/LICENSE-APACHE                       |   1 +
 gix-fsck/LICENSE-MIT                          |   1 +
 gix-fsck/src/lib.rs                           | 127 ++++++++++++++++++
 gix-fsck/tests/connectivity/mod.rs            |  84 ++++++++++++
 .../fixtures/generated-archives/.gitignore    |   1 +
 gix-fsck/tests/fixtures/make_test_repos.sh    |  32 +++++
 gix-fsck/tests/fsck.rs                        |   8 ++
 src/plumbing/main.rs                          |  15 ++-
 src/plumbing/options/mod.rs                   |  14 ++
 18 files changed, 378 insertions(+), 2 deletions(-)
 create mode 100644 gitoxide-core/src/repository/fsck.rs
 create mode 100644 gix-fsck/Cargo.toml
 create mode 100644 gix-fsck/Changelog.md
 create mode 120000 gix-fsck/LICENSE-APACHE
 create mode 120000 gix-fsck/LICENSE-MIT
 create mode 100644 gix-fsck/src/lib.rs
 create mode 100644 gix-fsck/tests/connectivity/mod.rs
 create mode 100644 gix-fsck/tests/fixtures/generated-archives/.gitignore
 create mode 100755 gix-fsck/tests/fixtures/make_test_repos.sh
 create mode 100644 gix-fsck/tests/fsck.rs

diff --git a/Cargo.lock b/Cargo.lock
index 6433d14dea5..6f207b9dfc8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1190,6 +1190,7 @@ dependencies = [
  "futures-lite",
  "gix",
  "gix-archive",
+ "gix-fsck",
  "gix-pack",
  "gix-status",
  "gix-transport",
@@ -1649,6 +1650,17 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "gix-fsck"
+version = "0.0.0"
+dependencies = [
+ "gix-hash 0.13.1",
+ "gix-hashtable 0.4.0",
+ "gix-object 0.38.0",
+ "gix-odb",
+ "gix-testtools",
+]
+
 [[package]]
 name = "gix-glob"
 version = "0.7.0"
diff --git a/Cargo.toml b/Cargo.toml
index 0234c89edcb..8bb402b772e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -281,6 +281,7 @@ members = [
     "gix-archive",
     "gix-worktree-stream",
     "gix-revwalk",
+    "gix-fsck",
 
     "tests/tools",
 
diff --git a/README.md b/README.md
index 44952b4dcf7..3d8c283eb63 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,7 @@ is usable to some extent.
   * [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui)
   * [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix)
   * [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle)
+  * [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck)
   
 ### Stress Testing
   * [x] Verify huge packs
diff --git a/crate-status.md b/crate-status.md
index 134a43c9b32..dfe112c77ef 100644
--- a/crate-status.md
+++ b/crate-status.md
@@ -775,6 +775,19 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
 * [x] validate submodule names
 * [x] [validate][tagname-validation] tag names
 
+### gix-fsck
+* [x] validate connectivity when provided a specific commit as a starting point
+* [ ] validate connectivity of all `refs` in the index
+* [ ] progress reporting and interruptability
+* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit)
+* [ ] add support for various options
+    * [ ] write dangling objects to the `.git/log-found` directory structure
+    * [ ] `strict` mode, to check for tree objects with `g+w` permissions
+    * [ ] consider reflog entries as reference nodes/heads
+    * [ ] when reporting reachable objects, include _how_ they are reachable
+        * [ ] which reference node(s) made them reachable
+        * [ ] parent commit(s)
+
 ### gix-ref
 * [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
 * **Stores**
diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml
index 3312c6724dc..aec7996081b 100644
--- a/gitoxide-core/Cargo.toml
+++ b/gitoxide-core/Cargo.toml
@@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
 gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
 gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
 gix-status = { version = "^0.2.0", path = "../gix-status" }
+gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
 serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
 anyhow = "1.0.42"
 thiserror = "1.0.34"
diff --git a/gitoxide-core/src/repository/fsck.rs b/gitoxide-core/src/repository/fsck.rs
new file mode 100644
index 00000000000..1628e62adb7
--- /dev/null
+++ b/gitoxide-core/src/repository/fsck.rs
@@ -0,0 +1,40 @@
+use std::io::{BufWriter, Write};
+
+use anyhow::Context;
+use gix::{objs::Kind, ObjectId};
+
+pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
+    let mut out = BufWriter::with_capacity(64 * 1024, out);
+    let spec = spec.unwrap_or("HEAD".into());
+
+    repo.object_cache_size_if_unset(4 * 1024 * 1024);
+    // We expect to be finding a bunch of non-existent objects here - never refresh the ODB
+    repo.objects.refresh_never();
+
+    let id = repo
+        .rev_parse_single(spec.as_str())
+        .context("Only single revisions are supported")?;
+    let commits: gix::revision::Walk<'_> = id
+        .object()?
+        .peel_to_kind(gix::object::Kind::Commit)
+        .context("Need commitish as starting point")?
+        .id()
+        .ancestors()
+        .all()?;
+
+    let missing_cb = |oid: &ObjectId, kind: Kind| {
+        writeln!(out, "{oid}: {kind}").expect("failed to write output");
+    };
+    let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);
+
+    // Walk all commits, checking each one for connectivity
+    for commit in commits {
+        let commit = commit?;
+        conn.check_commit(&commit.id);
+        for parent in commit.parent_ids {
+            conn.check_commit(&parent);
+        }
+    }
+
+    Ok(())
+}
diff --git a/gitoxide-core/src/repository/mod.rs b/gitoxide-core/src/repository/mod.rs
index c78e82edb85..4b97259b71a 100644
--- a/gitoxide-core/src/repository/mod.rs
+++ b/gitoxide-core/src/repository/mod.rs
@@ -35,6 +35,7 @@ pub use clone::function::clone;
 pub use fetch::function::fetch;
 
 pub mod commitgraph;
+pub mod fsck;
 pub mod index;
 pub mod mailmap;
 pub mod odb;
diff --git a/gix-fsck/Cargo.toml b/gix-fsck/Cargo.toml
new file mode 100644
index 00000000000..172c8cdb35f
--- /dev/null
+++ b/gix-fsck/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "gix-fsck"
+version = "0.0.0"
+repository = "https://github.com/Byron/gitoxide"
+authors = ["Cameron Esfahani <cesfahani@gmail.com>", "Sebastian Thiel <sebastian.thiel@icloud.com>"]
+license = "MIT OR Apache-2.0"
+description = "Verifies the connectivity and validity of objects in the database"
+edition = "2021"
+include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
+rust-version = "1.65"
+
+[lib]
+doctest = false
+
+[dependencies]
+gix-hash = { version = "^0.13.1", path = "../gix-hash" }
+gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
+gix-object = { version = "^0.38.0", path = "../gix-object" }
+
+[dev-dependencies]
+gix-odb = { path = "../gix-odb" }
+gix-testtools = { path = "../tests/tools"}
diff --git a/gix-fsck/Changelog.md b/gix-fsck/Changelog.md
new file mode 100644
index 00000000000..1d013ff92fa
--- /dev/null
+++ b/gix-fsck/Changelog.md
@@ -0,0 +1,6 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
diff --git a/gix-fsck/LICENSE-APACHE b/gix-fsck/LICENSE-APACHE
new file mode 120000
index 00000000000..965b606f331
--- /dev/null
+++ b/gix-fsck/LICENSE-APACHE
@@ -0,0 +1 @@
+../LICENSE-APACHE
\ No newline at end of file
diff --git a/gix-fsck/LICENSE-MIT b/gix-fsck/LICENSE-MIT
new file mode 120000
index 00000000000..76219eb72e8
--- /dev/null
+++ b/gix-fsck/LICENSE-MIT
@@ -0,0 +1 @@
+../LICENSE-MIT
\ No newline at end of file
diff --git a/gix-fsck/src/lib.rs b/gix-fsck/src/lib.rs
new file mode 100644
index 00000000000..6583697fc40
--- /dev/null
+++ b/gix-fsck/src/lib.rs
@@ -0,0 +1,127 @@
+//! A library for performing object database integrity and connectivity checks
+#![deny(rust_2018_idioms)]
+
+use gix_hash::ObjectId;
+use gix_hashtable::HashSet;
+use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
+
+pub struct ConnectivityCheck<'a, T, F>
+where
+    T: FindExt + Exists,
+    F: FnMut(&ObjectId, Kind),
+{
+    /// ODB handle to use for the check
+    db: &'a T,
+    /// Closure to invoke when a missing object is encountered
+    missing_cb: F,
+    /// Set of Object IDs already (or about to be) scanned during the check
+    oid_set: HashSet,
+    /// Single buffer for decoding objects from the ODB
+    /// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
+    buf: Vec<u8>,
+}
+
+impl<'a, T, F> ConnectivityCheck<'a, T, F>
+where
+    T: FindExt + Exists,
+    F: FnMut(&ObjectId, Kind),
+{
+    /// Instantiate a connectivity check
+    pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> {
+        ConnectivityCheck {
+            db,
+            missing_cb,
+            oid_set: HashSet::default(),
+            buf: Vec::new(),
+        }
+    }
+
+    /// Run the connectivity check on the provided commit object ID
+    /// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
+    /// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
+    /// - Any referenced blobs that are not present in the ODB will result in a call to the  `missing_cb`
+    /// - Missing commits or trees will currently result in panic
+    ///     - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
+    pub fn check_commit(&mut self, oid: &ObjectId) {
+        // Attempt to insert the commit ID in the set, and if already present, return immediately
+        if !self.oid_set.insert(*oid) {
+            return;
+        }
+        // Obtain the commit's tree ID
+        let tree_id = {
+            let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
+            commit.tree()
+        };
+
+        // Attempt to insert the tree ID in the set, and if already present, return immediately
+        if self.oid_set.insert(tree_id) {
+            self.check_tree(&tree_id);
+        }
+    }
+
+    fn check_tree(&mut self, oid: &ObjectId) {
+        let tree = match self.db.find_tree(oid, &mut self.buf) {
+            Ok(tree) => tree,
+            Err(_) => {
+                // Tree is missing, so invoke `missing_cb`
+                (self.missing_cb)(oid, Kind::Tree);
+                return;
+            }
+        };
+
+        // Keeping separate sets for trees and blobs for now...
+        // This is about a wash when compared to using a HashMap<ObjectID, Kind>
+        struct TreeEntries {
+            trees: HashSet<ObjectId>,
+            blobs: HashSet<ObjectId>,
+        }
+
+        // Build up a set of trees and a set of blobs
+        let entries: TreeEntries = {
+            let mut entries = TreeEntries {
+                trees: HashSet::default(),
+                blobs: HashSet::default(),
+            };
+
+            // For each entry in the tree
+            for entry_ref in tree.entries.iter() {
+                match entry_ref.mode {
+                    EntryMode::Tree => {
+                        let tree_id = entry_ref.oid.to_owned();
+                        // Check if the tree has already been encountered
+                        if self.oid_set.insert(tree_id) {
+                            entries.trees.insert(tree_id);
+                        }
+                    }
+                    EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
+                        let blob_id = entry_ref.oid.to_owned();
+                        // Check if the blob has already been encountered
+                        if self.oid_set.insert(blob_id) {
+                            entries.blobs.insert(blob_id);
+                        }
+                    }
+                    EntryMode::Commit => {
+                        // This implies a submodule (OID is the commit hash of the submodule)
+                        // Skip it as it's not in this repository!
+                    }
+                }
+            }
+            entries
+        };
+
+        for tree_id in entries.trees.iter() {
+            self.check_tree(tree_id);
+        }
+        for blob_id in entries.blobs.iter() {
+            self.check_blob(blob_id);
+        }
+    }
+
+    fn check_blob(&mut self, oid: &ObjectId) {
+        // Check if the blob is missing from the ODB
+        if !self.db.exists(oid) {
+            // Blob is missing, so invoke `missing_cb`
+            (self.missing_cb)(oid, Kind::Blob);
+        }
+    }
+}
diff --git a/gix-fsck/tests/connectivity/mod.rs b/gix-fsck/tests/connectivity/mod.rs
new file mode 100644
index 00000000000..4625fbf681e
--- /dev/null
+++ b/gix-fsck/tests/connectivity/mod.rs
@@ -0,0 +1,84 @@
+use std::ops::Deref;
+
+use gix_fsck::ConnectivityCheck;
+use gix_hash::ObjectId;
+use gix_hashtable::HashMap;
+use gix_object::Kind;
+use gix_testtools::once_cell::sync::Lazy;
+
+use crate::hex_to_id;
+
+fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> {
+    let db = {
+        let fixture_path = gix_testtools::scripted_fixture_read_only("make_test_repos.sh")
+            .expect("fixture path")
+            .join(repo_name)
+            .join(".git")
+            .join("objects");
+        let mut db = gix_odb::at(fixture_path).expect("odb handle");
+        db.refresh_never();
+        db
+    };
+
+    let mut missing: HashMap<ObjectId, Kind> = HashMap::default();
+    let callback = |oid: &ObjectId, kind: Kind| {
+        missing.try_insert(*oid, kind).expect("no duplicate oid");
+    };
+
+    let mut check = ConnectivityCheck::new(&db, callback);
+    for commit in commits.into_iter() {
+        check.check_commit(commit);
+    }
+
+    missing
+}
+
+fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> {
+    hex_ids.into_iter().map(hex_to_id).collect()
+}
+
+fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> {
+    hex_to_ids(hex_ids)
+        .into_iter()
+        .map(|id| (id, kind))
+        .collect::<HashMap<_, _>>()
+}
+
+// Get a `&Vec<ObjectID` for each commit in the test fixture repository
+fn all_commits<'a>() -> &'a Vec<ObjectId> {
+    static ALL_COMMITS: Lazy<Vec<ObjectId>> = Lazy::new(|| {
+        hex_to_ids(vec![
+            "5d18db2e2aabadf7b914435ef34f2faf8b4546dd",
+            "3a3dfaa55a515f3fb3a25751107bbb523af6a1b0",
+            "734c926856a328d1168ffd7088532e0d1ad19bbe",
+        ])
+    });
+    ALL_COMMITS.deref()
+}
+
+#[test]
+fn no_missing() {
+    // The "base" repo is the original, and has every object present
+    assert_eq!(check_missing("base", all_commits()), HashMap::default());
+}
+
+#[test]
+fn missing_blobs() {
+    // The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob
+    let expected = hex_to_objects(vec!["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob);
+    assert_eq!(check_missing("blobless", all_commits()), expected);
+}
+
+#[test]
+fn missing_trees() {
+    // The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees
+    // NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing
+    let expected = hex_to_objects(
+        vec![
+            "9561cfbae43c5e2accdfcd423378588dd10d827f",
+            "fc264b3b6875a46e9031483aeb9994a1b897ffd3",
+        ],
+        Kind::Tree,
+    );
+    assert_eq!(check_missing("treeless", all_commits()), expected);
+}
diff --git a/gix-fsck/tests/fixtures/generated-archives/.gitignore b/gix-fsck/tests/fixtures/generated-archives/.gitignore
new file mode 100644
index 00000000000..d47a40f0345
--- /dev/null
+++ b/gix-fsck/tests/fixtures/generated-archives/.gitignore
@@ -0,0 +1 @@
+make_test_repos.tar.xz
diff --git a/gix-fsck/tests/fixtures/make_test_repos.sh b/gix-fsck/tests/fixtures/make_test_repos.sh
new file mode 100755
index 00000000000..19491a0c551
--- /dev/null
+++ b/gix-fsck/tests/fixtures/make_test_repos.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+set -x
+set -euo pipefail
+
+# We override the global config with our own local one (see below)
+export GIT_CONFIG_GLOBAL="$PWD/.gitconfig"
+
+# We need to be able to do partial clones, so enable it
+# - needs to be present in the persistent gitconfig, as a clone with `--no-local`
+git config --global uploadpack.allowFilter true
+
+# First build out a base repository
+git init base
+(
+    cd base
+
+    echo "blob 1" > blob-1
+    git add -A
+    git commit -m "commit 1"
+    echo "blob-2" > blob-2
+    git add -A
+    git commit -m "commit 2"
+    git rm blob-1
+    git add -A
+    git commit -m "commit 3"
+)
+
+# Blobless clone
+git clone --no-local --no-hardlinks --filter=blob:none ./base blobless
+
+# Treeless (and blobless) clone
+git clone --no-local --no-hardlinks --filter=tree:0 ./base treeless
diff --git a/gix-fsck/tests/fsck.rs b/gix-fsck/tests/fsck.rs
new file mode 100644
index 00000000000..f12ebfebc25
--- /dev/null
+++ b/gix-fsck/tests/fsck.rs
@@ -0,0 +1,8 @@
+use gix_hash::ObjectId;
+pub use gix_testtools::Result;
+
+pub fn hex_to_id(hex: &str) -> ObjectId {
+    ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex")
+}
+
+mod connectivity;
diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs
index f36d0494b26..670ab1b391c 100644
--- a/src/plumbing/main.rs
+++ b/src/plumbing/main.rs
@@ -15,8 +15,8 @@ use gix::bstr::{io::BufReadExt, BString};
 
 use crate::plumbing::{
     options::{
-        attributes, commit, commitgraph, config, credential, exclude, free, index, mailmap, odb, revision, tree, Args,
-        Subcommands,
+        attributes, commit, commitgraph, config, credential, exclude, free, fsck, index, mailmap, odb, revision, tree,
+        Args, Subcommands,
     },
     show_progress,
 };
@@ -1072,6 +1072,17 @@ pub fn main() -> Result<()> {
                 move |_progress, out, err| core::repository::odb::info(repository(Mode::Strict)?, format, out, err),
             ),
         },
+        Subcommands::Fsck(cmd) => match cmd {
+            fsck::Subcommands::Connectivity { spec } => prepare_and_run(
+                "fsck-connectivity",
+                trace,
+                verbose,
+                progress,
+                progress_keep_open,
+                None,
+                move |_progress, out, _err| core::repository::fsck::connectivity(repository(Mode::Strict)?, spec, out),
+            ),
+        },
         Subcommands::Mailmap(cmd) => match cmd {
             mailmap::Subcommands::Entries => prepare_and_run(
                 "mailmap-entries",
diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs
index f7b484aaf6f..1eb84ea8400 100644
--- a/src/plumbing/options/mod.rs
+++ b/src/plumbing/options/mod.rs
@@ -87,6 +87,9 @@ pub enum Subcommands {
     /// Interact with the object database.
     #[clap(subcommand)]
     Odb(odb::Subcommands),
+    /// Perform integrity checks on the repository.
+    #[clap(subcommand)]
+    Fsck(fsck::Subcommands),
     /// Interact with tree objects.
     #[clap(subcommand)]
     Tree(tree::Subcommands),
@@ -489,6 +492,17 @@ pub mod odb {
     }
 }
 
+pub mod fsck {
+    #[derive(Debug, clap::Subcommand)]
+    pub enum Subcommands {
+        /// Perform a connectivity check on the repository.
+        Connectivity {
+            /// A revspec to start the connectivity check from.
+            spec: Option<String>,
+        },
+    }
+}
+
 pub mod tree {
     #[derive(Debug, clap::Subcommand)]
     pub enum Subcommands {