From a30eebf355c3cd0b17a04f23cb318fb459983fa9 Mon Sep 17 00:00:00 2001 From: Jacob Finkelman Date: Mon, 7 Nov 2022 16:43:49 +0000 Subject: [PATCH 1/2] Add a nonstandard shallow clone for GitHub --- src/cargo/core/source/source_id.rs | 5 +- src/cargo/sources/git/source.rs | 2 +- src/cargo/sources/git/utils.rs | 214 ++++++++++++++++++++++----- src/cargo/sources/registry/remote.rs | 20 +-- 4 files changed, 182 insertions(+), 59 deletions(-) diff --git a/src/cargo/core/source/source_id.rs b/src/cargo/core/source/source_id.rs index bb43843b5c0..cefa3c796e9 100644 --- a/src/cargo/core/source/source_id.rs +++ b/src/cargo/core/source/source_id.rs @@ -4,8 +4,7 @@ use crate::sources::{DirectorySource, CRATES_IO_DOMAIN, CRATES_IO_INDEX, CRATES_ use crate::sources::{GitSource, PathSource, RegistrySource}; use crate::util::{config, CanonicalUrl, CargoResult, Config, IntoUrl}; use log::trace; -use serde::de; -use serde::ser; +use serde::{de, ser, Serialize}; use std::cmp::{self, Ordering}; use std::collections::HashSet; use std::fmt::{self, Formatter}; @@ -64,7 +63,7 @@ enum SourceKind { } /// Information to find a specific commit in a Git repository. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] pub enum GitReference { /// From a tag. Tag(String), diff --git a/src/cargo/sources/git/source.rs b/src/cargo/sources/git/source.rs index d09d5271627..4b383f8fcf3 100644 --- a/src/cargo/sources/git/source.rs +++ b/src/cargo/sources/git/source.rs @@ -144,7 +144,7 @@ impl<'cfg> Source for GitSource<'cfg> { // database, then try to resolve our reference with the preexisting // repository. (None, Some(db)) if self.config.offline() => { - let rev = db.resolve(&self.manifest_reference).with_context(|| { + let rev = db.resolve_to_object(&self.manifest_reference).with_context(|| { "failed to lookup reference in preexisting repository, and \ can't check for updates in offline mode (--offline)" })?; diff --git a/src/cargo/sources/git/utils.rs b/src/cargo/sources/git/utils.rs index 623caceb803..8aab435f539 100644 --- a/src/cargo/sources/git/utils.rs +++ b/src/cargo/sources/git/utils.rs @@ -9,14 +9,13 @@ use cargo_util::{paths, ProcessBuilder}; use curl::easy::List; use git2::{self, ErrorClass, ObjectType, Oid}; use log::{debug, info}; -use serde::ser; -use serde::Serialize; +use serde::{ser, Deserialize, Serialize}; use std::borrow::Cow; use std::env; use std::fmt; use std::path::{Path, PathBuf}; use std::process::Command; -use std::str; +use std::str::{self, FromStr}; use std::time::{Duration, Instant}; use url::Url; @@ -28,6 +27,17 @@ where s.collect_str(t) } +fn deserialize_str<'de, D, T>(deserializer: D) -> Result +where + T: FromStr, + ::Err: fmt::Display, + D: serde::Deserializer<'de>, +{ + let buf = String::deserialize(deserializer)?; + + FromStr::from_str(&buf).map_err(serde::de::Error::custom) +} + pub struct GitShortID(git2::Buf); impl GitShortID { @@ -78,8 +88,25 @@ impl GitRemote { &self.url } - pub fn rev_for(&self, path: &Path, reference: &GitReference) -> CargoResult { - reference.resolve(&self.db_at(path)?.repo) + /// Finds the Oid associated with the reference. The result is guaranteed to be on disk. + /// But may not be the object the reference points to! + /// For example, the reference points to a Commit and this may return the Tree that commit points do. + pub fn rev_to_object_for( + &self, + path: &Path, + reference: &GitReference, + ) -> CargoResult { + reference.resolve_to_object(&self.db_at(path)?.repo) + } + + /// Finds the Oid of the Commit the reference points to. But the result may not be on disk! + /// For example, the reference points to a Commit and we have only cloned the Tree. + pub fn rev_to_commit_for( + &self, + path: &Path, + reference: &GitReference, + ) -> CargoResult { + reference.resolve_to_commit(&self.db_at(path)?.repo) } pub fn checkout( @@ -104,7 +131,7 @@ impl GitRemote { } } None => { - if let Ok(rev) = reference.resolve(&db.repo) { + if let Ok(rev) = reference.resolve_to_object(&db.repo) { return Ok((db, rev)); } } @@ -123,7 +150,7 @@ impl GitRemote { .context(format!("failed to clone into: {}", into.display()))?; let rev = match locked_rev { Some(rev) => rev, - None => reference.resolve(&repo)?, + None => reference.resolve_to_object(&repo)?, }; Ok(( @@ -179,13 +206,65 @@ impl GitDatabase { self.repo.revparse_single(&oid.to_string()).is_ok() } - pub fn resolve(&self, r: &GitReference) -> CargoResult { - r.resolve(&self.repo) + pub fn resolve_to_object(&self, r: &GitReference) -> CargoResult { + r.resolve_to_object(&self.repo) } } +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +struct ShallowDataBlob { + #[serde(serialize_with = "serialize_str")] + #[serde(deserialize_with = "deserialize_str")] + tree: git2::Oid, + #[serde(serialize_with = "serialize_str")] + #[serde(deserialize_with = "deserialize_str")] + etag: git2::Oid, +} + +#[test] +fn check_with_git_hub() { + panic!( + r#"nonstandard shallow clone may be worse than a full check out. + This test is here to make sure we do not merge until we have official signoff from GitHub"# + ) +} + impl GitReference { - pub fn resolve(&self, repo: &git2::Repository) -> CargoResult { + /// Finds the Oid associated with the reference. The result is guaranteed to be on disk. + /// But may not be the object the reference points to! + /// For example, the reference points to a Commit and this may return the Tree that commit points do. + pub fn resolve_to_object(&self, repo: &git2::Repository) -> CargoResult { + // Check if Cargo has done a nonstandard shallow clone + if let Some(shallow_data) = self.find_shallow_blob(repo) { + Ok(shallow_data.tree) + } else { + self.resolve_by_git(repo) + } + } + /// Finds the Oid of the Commit the reference points to. But the result may not be on disk! + /// For example, the reference points to a Commit and we have only cloned the Tree. + pub fn resolve_to_commit(&self, repo: &git2::Repository) -> CargoResult { + // Check if Cargo has done a nonstandard shallow clone + if let Some(shallow_data) = self.find_shallow_blob(repo) { + return Ok(shallow_data.etag); + } else { + self.resolve_by_git(repo) + } + } + + fn find_shallow_blob(&self, repo: &git2::Repository) -> Option { + repo.find_reference( + &(format!( + "refs/cargo-{}", + serde_json::to_string(self).expect("why cant we make json of this") + )), + ) + .ok() + .and_then(|re| re.peel_to_blob().ok()) + .and_then(|blob| serde_json::from_slice(blob.content()).ok()) + } + + fn resolve_by_git(&self, repo: &git2::Repository) -> CargoResult { let id = match self { // Note that we resolve the named tag here in sync with where it's // fetched into via `fetch` below. @@ -707,10 +786,17 @@ fn reset(repo: &git2::Repository, obj: &git2::Object<'_>, config: &Config) -> Ca opts.progress(|_, cur, max| { drop(pb.tick(cur, max, "")); }); - debug!("doing reset"); - repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?; - debug!("reset done"); - Ok(()) + if obj.as_tree().is_some() { + debug!("doing reset for Cargo nonstandard shallow clone"); + repo.checkout_tree(obj, Some(&mut opts))?; + debug!("reset done"); + Ok(()) + } else { + debug!("doing reset"); + repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?; + debug!("reset done"); + Ok(()) + } } pub fn with_fetch_options( @@ -816,32 +902,44 @@ pub fn fetch( // The `+` symbol on the refspec means to allow a forced (fast-forward) // update which is needed if there is ever a force push that requires a // fast-forward. - match reference { - // For branches and tags we can fetch simply one reference and copy it - // locally, no need to fetch other branches/tags. - GitReference::Branch(b) => { - refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b)); - } - GitReference::Tag(t) => { - refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t)); + if let Some(oid_to_fetch) = oid_to_fetch { + // GitHub told us exactly the min needed to fetch. So we can go ahead and do a Cargo nonstandard shallow clone. + refspecs.push(format!("+{0}", oid_to_fetch)); + } else { + // In some cases we have Cargo nonstandard shallow cloned this repo before, but cannot do it now. + // Mostly if GitHub is now rate limiting us. If so, remove the info about the shallow clone. + if let Ok(mut refe) = repo.find_reference(&format!( + "refs/cargo-{}", + serde_json::to_string(reference).expect("why cant we make json of this") + )) { + let _ = refe.delete(); } - GitReference::DefaultBranch => { - refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD")); - } + match reference { + // For branches and tags we can fetch simply one reference and copy it + // locally, no need to fetch other branches/tags. + GitReference::Branch(b) => { + refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b)); + } + GitReference::Tag(t) => { + refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t)); + } - GitReference::Rev(rev) => { - if rev.starts_with("refs/") { - refspecs.push(format!("+{0}:{0}", rev)); - } else if let Some(oid_to_fetch) = oid_to_fetch { - refspecs.push(format!("+{0}:refs/commit/{0}", oid_to_fetch)); - } else { - // We don't know what the rev will point to. To handle this - // situation we fetch all branches and tags, and then we pray - // it's somewhere in there. - refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*")); + GitReference::DefaultBranch => { refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD")); - tags = true; + } + + GitReference::Rev(rev) => { + if rev.starts_with("refs/") { + refspecs.push(format!("+{0}:{0}", rev)); + } else { + // We don't know what the rev will point to. To handle this + // situation we fetch all branches and tags, and then we pray + // it's somewhere in there. + refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*")); + refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD")); + tags = true; + } } } } @@ -1105,7 +1203,7 @@ fn github_fast_path( return Ok(FastPathRev::Indeterminate); } - let local_object = reference.resolve(repo).ok(); + let local_object = reference.resolve_to_commit(repo).ok(); let github_branch_name = match reference { GitReference::Branch(branch) => branch, @@ -1175,7 +1273,7 @@ fn github_fast_path( handle.useragent("cargo")?; handle.http_headers({ let mut headers = List::new(); - headers.append("Accept: application/vnd.github.3.sha")?; + headers.append("Accept: application/vnd.github+json")?; if let Some(local_object) = local_object { headers.append(&format!("If-None-Match: \"{}\"", local_object))?; } @@ -1195,8 +1293,44 @@ fn github_fast_path( if response_code == 304 { Ok(FastPathRev::UpToDate) } else if response_code == 200 { - let oid_to_fetch = str::from_utf8(&response_body)?.parse::()?; - Ok(FastPathRev::NeedsFetch(oid_to_fetch)) + #[derive(Debug, Deserialize)] + struct GithubFastPathJsonResponse { + #[serde(serialize_with = "serialize_str")] + #[serde(deserialize_with = "deserialize_str")] + sha: git2::Oid, + commit: GithubCommitJsonResponse, + } + + #[derive(Debug, Deserialize)] + struct GithubCommitJsonResponse { + tree: GithubTreeJsonResponse, + } + + #[derive(Debug, Deserialize)] + struct GithubTreeJsonResponse { + #[serde(serialize_with = "serialize_str")] + #[serde(deserialize_with = "deserialize_str")] + sha: git2::Oid, + } + + let data: GithubFastPathJsonResponse = serde_json::from_slice(&response_body)?; + // We can do a Cargo nonstandard shallow clone, so record the relevant information. + let bytes = serde_json::to_string(&ShallowDataBlob { + tree: data.commit.tree.sha, + etag: data.sha, + }) + .expect("why cant we make json of this"); + let shallow_blob = repo.blob(bytes.as_bytes())?; + repo.reference( + &format!( + "refs/cargo-{}", + serde_json::to_string(reference).expect("why cant we make json of this") + ), + shallow_blob, + true, + "", + )?; + Ok(FastPathRev::NeedsFetch(data.commit.tree.sha)) } else { // Usually response_code == 404 if the repository does not exist, and // response_code == 422 if exists but GitHub is unable to resolve the diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs index b9283b819e6..8f9e963c5ac 100644 --- a/src/cargo/sources/registry/remote.rs +++ b/src/cargo/sources/registry/remote.rs @@ -29,7 +29,6 @@ pub struct RemoteRegistry<'cfg> { config: &'cfg Config, tree: RefCell>>, repo: LazyCell, - head: Cell>, current_sha: Cell>, needs_update: bool, // Does this registry need to be updated? } @@ -45,7 +44,6 @@ impl<'cfg> RemoteRegistry<'cfg> { index_git_ref: GitReference::DefaultBranch, tree: RefCell::new(None), repo: LazyCell::new(), - head: Cell::new(None), current_sha: Cell::new(None), needs_update: false, } @@ -94,15 +92,6 @@ impl<'cfg> RemoteRegistry<'cfg> { }) } - fn head(&self) -> CargoResult { - if self.head.get().is_none() { - let repo = self.repo()?; - let oid = self.index_git_ref.resolve(repo)?; - self.head.set(Some(oid)); - } - Ok(self.head.get().unwrap()) - } - fn tree(&self) -> CargoResult>> { { let tree = self.tree.borrow(); @@ -111,8 +100,8 @@ impl<'cfg> RemoteRegistry<'cfg> { } } let repo = self.repo()?; - let commit = repo.find_commit(self.head()?)?; - let tree = commit.tree()?; + let oid = self.index_git_ref.resolve_to_object(repo)?; + let tree = repo.find_object(oid, None)?.peel_to_tree()?; // Unfortunately in libgit2 the tree objects look like they've got a // reference to the repository object which means that a tree cannot @@ -135,7 +124,9 @@ impl<'cfg> RemoteRegistry<'cfg> { if let Some(sha) = self.current_sha.get() { return Some(sha); } - let sha = InternedString::new(&self.head().ok()?.to_string()); + let repo = self.repo().ok()?; + let oid = self.index_git_ref.resolve_to_commit(repo).ok()?; + let sha = InternedString::new(&oid.to_string()); self.current_sha.set(Some(sha)); Some(sha) } @@ -284,7 +275,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { self.config.http()?; self.prepare()?; - self.head.set(None); *self.tree.borrow_mut() = None; self.current_sha.set(None); let path = self.config.assert_package_cache_locked(&self.index_path); From ca588b96c7bec81d431896949d8b91658f1b1479 Mon Sep 17 00:00:00 2001 From: Jacob Finkelman Date: Mon, 7 Nov 2022 17:08:00 +0000 Subject: [PATCH 2/2] tell `git gc` that we care about the tree --- src/cargo/sources/git/utils.rs | 62 +++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/src/cargo/sources/git/utils.rs b/src/cargo/sources/git/utils.rs index 8aab435f539..0aaa776150b 100644 --- a/src/cargo/sources/git/utils.rs +++ b/src/cargo/sources/git/utils.rs @@ -253,15 +253,22 @@ impl GitReference { } fn find_shallow_blob(&self, repo: &git2::Repository) -> Option { - repo.find_reference( - &(format!( - "refs/cargo-{}", - serde_json::to_string(self).expect("why cant we make json of this") - )), - ) - .ok() - .and_then(|re| re.peel_to_blob().ok()) - .and_then(|blob| serde_json::from_slice(blob.content()).ok()) + let mut name = format!( + "refs/cargo-{}", + serde_json::to_string(self).expect("why cant we make json of this") + ); + let shallow_blob = repo + .find_reference(&name) + .ok() + .and_then(|re| re.peel_to_blob().ok()) + .and_then(|blob| serde_json::from_slice(blob.content()).ok())?; + // A nonstandard shallow clone should also have a reference to keep `git gc` from removing the tree. + name = name + "Tree"; + let tree = repo.find_reference(&name).ok()?; + if tree.target()? != shallow_blob.tree { + return None; + } + shallow_blob } fn resolve_by_git(&self, repo: &git2::Repository) -> CargoResult { @@ -904,14 +911,20 @@ pub fn fetch( // fast-forward. if let Some(oid_to_fetch) = oid_to_fetch { // GitHub told us exactly the min needed to fetch. So we can go ahead and do a Cargo nonstandard shallow clone. - refspecs.push(format!("+{0}", oid_to_fetch)); + refspecs.push(oid_to_fetch); } else { // In some cases we have Cargo nonstandard shallow cloned this repo before, but cannot do it now. // Mostly if GitHub is now rate limiting us. If so, remove the info about the shallow clone. - if let Ok(mut refe) = repo.find_reference(&format!( + let mut name = format!( "refs/cargo-{}", serde_json::to_string(reference).expect("why cant we make json of this") - )) { + ); + if let Ok(mut refe) = repo.find_reference(&name) { + let _ = refe.delete(); + } + // And remove the reference that keeps `git gc` from cleaning the tree. + name = name + "Tree"; + if let Ok(mut refe) = repo.find_reference(&name) { let _ = refe.delete(); } @@ -1170,9 +1183,9 @@ enum FastPathRev { /// The local rev (determined by `reference.resolve(repo)`) is already up to /// date with what this rev resolves to on GitHub's server. UpToDate, - /// The following SHA must be fetched in order for the local rev to become + /// The following refspec must be fetched in order for the local rev to become /// up to date. - NeedsFetch(Oid), + NeedsFetch(String), /// Don't know whether local rev is up to date. We'll fetch _all_ branches /// and tags from the server and see what happens. Indeterminate, @@ -1321,16 +1334,17 @@ fn github_fast_path( }) .expect("why cant we make json of this"); let shallow_blob = repo.blob(bytes.as_bytes())?; - repo.reference( - &format!( - "refs/cargo-{}", - serde_json::to_string(reference).expect("why cant we make json of this") - ), - shallow_blob, - true, - "", - )?; - Ok(FastPathRev::NeedsFetch(data.commit.tree.sha)) + let mut name = format!( + "refs/cargo-{}", + serde_json::to_string(reference).expect("why cant we make json of this") + ); + repo.reference(&name, shallow_blob, true, "")?; + // Make a refspec to fetch the tree derectly, with a reference so that `git gc` does not clean it up. + Ok(FastPathRev::NeedsFetch(format!( + "+{}:{}Tree", + data.commit.tree.sha, + name + ))) } else { // Usually response_code == 404 if the repository does not exist, and // response_code == 422 if exists but GitHub is unable to resolve the