Skip to content

Commit

Permalink
Handle large multi-pack indices correctly (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
Byron committed Jan 1, 2022
1 parent ba92cc0 commit 4f6b030
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 26 deletions.
13 changes: 8 additions & 5 deletions git-pack/src/multi_index/access.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,14 @@ impl File {
let offset = &self.data[start + 4..][..4];
let ofs32 = BigEndian::read_u32(offset);
let pack_offset = if (ofs32 & HIGH_BIT) == HIGH_BIT {
let offsets_64 = self
.large_offsets_ofs
.expect("non-malformed file that has large offsets if these are contained");
let from = offsets_64 + (ofs32 ^ HIGH_BIT) as usize * 8;
BigEndian::read_u64(&self.data[from..][..8])
// We determine if large offsets are actually larger than 4GB; if not, we don't use the high bit
// to signal anything, but let the presence of the large-offset chunk signal what's happening.
if let Some(offsets_64) = self.large_offsets_ofs {
let from = offsets_64 + (ofs32 ^ HIGH_BIT) as usize * 8;
BigEndian::read_u64(&self.data[from..][..8])
} else {
ofs32 as u64
}
} else {
ofs32 as u64
};
Expand Down
38 changes: 28 additions & 10 deletions git-pack/src/multi_index/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ pub mod lookup {

/// Information about the offsets table.
pub mod offsets {
use std::convert::TryInto;
use std::ops::Range;

use byteorder::{BigEndian, WriteBytesExt};
Expand All @@ -192,6 +193,7 @@ pub mod offsets {

pub(crate) fn write(
sorted_entries: &[multi_index::write::Entry],
large_offsets_needed: bool,
mut out: impl std::io::Write,
) -> std::io::Result<()> {
use crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD};
Expand All @@ -200,12 +202,19 @@ pub mod offsets {
for entry in sorted_entries {
out.write_u32::<BigEndian>(entry.pack_index)?;

let offset = if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
let res = num_large_offsets | HIGH_BIT;
num_large_offsets += 1;
res
let offset: u32 = if large_offsets_needed {
if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
let res = num_large_offsets | HIGH_BIT;
num_large_offsets += 1;
res
} else {
entry.pack_offset as u32
}
} else {
entry.pack_offset as u32
entry
.pack_offset
.try_into()
.expect("without large offsets, pack-offset fits u32")
};
out.write_u32::<BigEndian>(offset)?;
}
Expand All @@ -230,11 +239,20 @@ pub mod large_offsets {
/// The id uniquely identifying the large offsets table (with 64 bit offsets)
pub const ID: git_chunk::Id = *b"LOFF";

pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> usize {
entries
.iter()
.filter(|e| e.pack_offset > LARGE_OFFSET_THRESHOLD)
.count()
/// Returns `Some(count)` — the number of entries destined for the large-offset table —
/// if any pack offset is too large to fit into a `u32`, and `None` otherwise.
pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> {
    // An offset that exceeds u32::MAX makes the large-offset chunk mandatory.
    let large_offsets_required = entries
        .iter()
        .any(|e| e.pack_offset > u32::MAX as crate::data::Offset);

    // Count only when the chunk will actually be written; the count covers every
    // entry whose offset crosses the inline-storage threshold.
    large_offsets_required.then(|| {
        entries
            .iter()
            .filter(|e| e.pack_offset > LARGE_OFFSET_THRESHOLD)
            .count()
    })
}
/// Returns true if the `offsets` range seems to be properly aligned for the data we expect.
pub fn is_valid(offset: &Range<usize>) -> bool {
Expand Down
8 changes: 4 additions & 4 deletions git-pack/src/multi_index/verify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pub mod integrity {
/// The computed checksum of the multi-index which matched the stored one.
pub actual_index_checksum: git_hash::ObjectId,
/// For each entry in [`index_names()`][super::File::index_names()], the corresponding pack traversal statistics.
pub pack_traverse_outcomes: Vec<crate::index::traverse::Statistics>,
pub pack_traverse_statistics: Vec<crate::index::traverse::Statistics>,
/// The provided progress instance.
pub progress: P,
}
Expand Down Expand Up @@ -121,7 +121,7 @@ impl File {
return Err(crate::index::traverse::Error::Processor(integrity::Error::Empty));
}

let mut pack_traverse_outcomes = Vec::new();
let mut pack_traverse_statistics = Vec::new();

progress.set_name("Validating");
let start = std::time::Instant::now();
Expand Down Expand Up @@ -245,7 +245,7 @@ impl File {
Interrupted => Interrupted,
}
})?;
pack_traverse_outcomes.push(pack_traverse_outcome);
pack_traverse_statistics.push(pack_traverse_outcome);
}

assert_eq!(
Expand All @@ -257,7 +257,7 @@ impl File {

Ok(integrity::Outcome {
actual_index_checksum,
pack_traverse_outcomes,
pack_traverse_statistics,
progress,
})
}
Expand Down
12 changes: 8 additions & 4 deletions git-pack/src/multi_index/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ impl multi_index::File {
);

let num_large_offsets = multi_index::chunk::large_offsets::num_large_offsets(&entries);
if num_large_offsets > 0 {
if let Some(num_large_offsets) = num_large_offsets {
cf.plan_chunk(
multi_index::chunk::large_offsets::ID,
multi_index::chunk::large_offsets::storage_size(num_large_offsets),
Expand Down Expand Up @@ -179,10 +179,14 @@ impl multi_index::File {
}
multi_index::chunk::fanout::ID => multi_index::chunk::fanout::write(&entries, &mut chunk_write)?,
multi_index::chunk::lookup::ID => multi_index::chunk::lookup::write(&entries, &mut chunk_write)?,
multi_index::chunk::offsets::ID => multi_index::chunk::offsets::write(&entries, &mut chunk_write)?,
multi_index::chunk::large_offsets::ID => {
multi_index::chunk::large_offsets::write(&entries, num_large_offsets, &mut chunk_write)?
multi_index::chunk::offsets::ID => {
multi_index::chunk::offsets::write(&entries, num_large_offsets.is_some(), &mut chunk_write)?
}
multi_index::chunk::large_offsets::ID => multi_index::chunk::large_offsets::write(
&entries,
num_large_offsets.expect("available if planned"),
&mut chunk_write,
)?,
unknown => unreachable!("BUG: forgot to implement chunk {:?}", std::str::from_utf8(&unknown)),
}
progress.inc();
Expand Down
2 changes: 1 addition & 1 deletion git-pack/tests/pack/multi_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ mod verify {
.unwrap();
assert_eq!(outcome.actual_index_checksum, file.checksum());
assert_eq!(
outcome.pack_traverse_outcomes,
outcome.pack_traverse_statistics,
vec![git_pack::index::traverse::Statistics {
average: git_pack::data::decode_entry::Outcome {
kind: git_object::Kind::Tree,
Expand Down
4 changes: 2 additions & 2 deletions gitoxide-core/src/pack/verify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,13 @@ where
let res = multi_index.verify_integrity(cache, progress, should_interrupt, git::odb::pack::multi_index::verify::integrity::Options{verify_mode: mode, traversal: algorithm.into(),thread_limit })?;
match output_statistics {
Some(OutputFormat::Human) => {
for (index_name, stats) in multi_index.index_names().iter().zip(res.pack_traverse_outcomes) {
for (index_name, stats) in multi_index.index_names().iter().zip(res.pack_traverse_statistics) {
writeln!(out, "{}", index_name.display()).ok();
drop(print_statistics(&mut out, &stats));
}
},
#[cfg(feature = "serde1")]
Some(OutputFormat::Json) => serde_json::to_writer_pretty(out, &multi_index.index_names().iter().zip(res.pack_traverse_outcomes).collect::<Vec<_>>())?,
Some(OutputFormat::Json) => serde_json::to_writer_pretty(out, &multi_index.index_names().iter().zip(res.pack_traverse_statistics).collect::<Vec<_>>())?,
_ => {}
};
return Ok(())
Expand Down

0 comments on commit 4f6b030

Please sign in to comment.