Skip to content

rough FSCK for git-odb::Store #290

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions git-hash/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

mod borrowed;

use std::convert::TryFrom;
use std::str::FromStr;
use std::{convert::TryFrom, str::FromStr};

pub use borrowed::oid;

Expand Down
6 changes: 3 additions & 3 deletions git-odb/src/store_impls/dynamic/load_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl super::Store {

/// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated.
/// `load_new_index` is an optimization to at least provide one newly loaded pack after refreshing the slot map.
fn consolidate_with_disk_state(&self, load_new_index: bool) -> Result<Option<Snapshot>, Error> {
pub(crate) fn consolidate_with_disk_state(&self, load_new_index: bool) -> Result<Option<Snapshot>, Error> {
let index = self.index.load();
let previous_index_state = Arc::as_ptr(&index) as usize;

Expand Down Expand Up @@ -372,8 +372,8 @@ impl super::Store {
};
slot.files.store(files);
if !needs_stable_indices {
// Not racy due to lock, generation must be set after unsetting the value AND storing it.
slot.generation.store(0, Ordering::SeqCst);
// Not racy due to lock, generation must be set after unsetting the slot value AND storing it.
slot.generation.store(generation, Ordering::SeqCst);
}
}

Expand Down
3 changes: 3 additions & 0 deletions git-odb/src/store_impls/dynamic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ pub(crate) mod handle;
///
pub mod load_index;

///
pub mod verify;

mod load_one;

mod metrics;
3 changes: 3 additions & 0 deletions git-odb/src/store_impls/dynamic/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ pub(crate) enum OnDiskFileState<T: Clone> {
}

impl<T: Clone> OnDiskFile<T> {
/// Return the path at which this file resides on disk.
pub fn path(&self) -> &Path {
&*self.path
}
/// Return true if we hold a memory map of the file already.
pub fn is_loaded(&self) -> bool {
matches!(self.state, OnDiskFileState::Loaded(_) | OnDiskFileState::Garbage(_))
Expand Down
215 changes: 215 additions & 0 deletions git-odb/src/store_impls/dynamic/verify.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
use std::{
ops::Deref,
sync::atomic::{AtomicBool, Ordering},
};

use git_features::progress::Progress;

use crate::{
pack,
store::verify::integrity::{IndexStatistics, SingleOrMultiStatistics},
types::IndexAndPacks,
};

///
pub mod integrity {
use std::path::PathBuf;

use crate::pack;

/// Options for use in [`Store::verify_integrity()`][crate::Store::verify_integrity()].
pub type Options<F> = pack::index::verify::integrity::Options<F>;

/// Returned by [`Store::verify_integrity()`][crate::Store::verify_integrity()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error(transparent)]
MultiIndexIntegrity(#[from] pack::index::traverse::Error<pack::multi_index::verify::integrity::Error>),
#[error(transparent)]
IndexIntegrity(#[from] pack::index::traverse::Error<pack::index::verify::integrity::Error>),
#[error(transparent)]
IndexOpen(#[from] pack::index::init::Error),
#[error(transparent)]
LooseObjectStoreIntegrity(#[from] crate::loose::verify::integrity::Error),
#[error(transparent)]
MultiIndexOpen(#[from] pack::multi_index::init::Error),
#[error(transparent)]
PackOpen(#[from] pack::data::init::Error),
#[error(transparent)]
InitializeODB(#[from] crate::store::load_index::Error),
#[error("The disk on state changed while performing the operation, and we observed the change.")]
NeedsRetryDueToChangeOnDisk,
}

#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
/// Integrity information about loose object databases
pub struct LooseObjectStatistics {
/// The path to the root directory of the loose objects database
pub path: PathBuf,
/// The statistics created after verifying the loose object database.
pub statistics: crate::loose::verify::integrity::Statistics,
}

#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
/// Traversal statistics of packs governed by single indices or multi-pack indices.
#[allow(missing_docs)]
pub enum SingleOrMultiStatistics {
Single(pack::index::traverse::Statistics),
Multi(Vec<(PathBuf, pack::index::traverse::Statistics)>),
}

/// Statistics gathered when traversing packs of various kinds of indices.
#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub struct IndexStatistics {
/// The path to the index or multi-pack index for which statics were gathered.
pub path: PathBuf,
/// The actual statistics for the index at `path`.
pub statistics: SingleOrMultiStatistics,
}

/// Returned by [`Store::verify_integrity()`][crate::Store::verify_integrity()].
pub struct Outcome<P> {
/// Statistics for validated loose object stores.
pub loose_object_stores: Vec<LooseObjectStatistics>,
/// Pack traversal statistics for each index and their pack(s)
pub index_statistics: Vec<IndexStatistics>,
/// The provided progress instance.
pub progress: P,
}
}

impl super::Store {
    /// Check the integrity of all objects as per the given `options`.
    ///
    /// Note that this will not force loading all indices or packs permanently, as we will only use the momentarily loaded disk state.
    /// This does, however, include all alternates.
    pub fn verify_integrity<C, P, F>(
        &self,
        mut progress: P,
        should_interrupt: &AtomicBool,
        options: integrity::Options<F>,
    ) -> Result<integrity::Outcome<P>, integrity::Error>
    where
        P: Progress,
        C: pack::cache::DecodeEntry,
        F: Fn() -> C + Send + Clone,
    {
        let mut index = self.index.load();
        if !index.is_initialized() {
            self.consolidate_with_disk_state(false)?;
            index = self.index.load();
            assert!(
                index.is_initialized(),
                "BUG: after consolidating successfully, we have an initialized index"
            );
        }

        progress.init(
            Some(index.slot_indices.len()),
            git_features::progress::count("pack indices"),
        );
        let mut statistics = Vec::new();
        for slot_index in &index.slot_indices {
            let slot = &self.files[*slot_index];
            // A generation mismatch means the slot was rewritten while we held the snapshot - abort and let the caller retry.
            if slot.generation.load(Ordering::SeqCst) != index.generation {
                return Err(integrity::Error::NeedsRetryDueToChangeOnDisk);
            }
            let files = slot.files.load();
            let files = Option::as_ref(&files).ok_or(integrity::Error::NeedsRetryDueToChangeOnDisk)?;

            match files {
                IndexAndPacks::Index(bundle) => {
                    // Use the already-loaded memory map if present; otherwise open the file only for the
                    // duration of this check so we don't force-load state permanently.
                    let index;
                    let index = match bundle.index.loaded() {
                        Some(index) => index.deref(),
                        None => {
                            index = pack::index::File::at(bundle.index.path(), self.object_hash)?;
                            &index
                        }
                    };
                    let pack;
                    let data = match bundle.data.loaded() {
                        Some(pack) => pack.deref(),
                        None => {
                            pack = pack::data::File::at(bundle.data.path(), self.object_hash)?;
                            &pack
                        }
                    };
                    let outcome = index.verify_integrity(
                        Some(pack::index::verify::PackContext {
                            data,
                            options: options.clone(),
                        }),
                        progress.add_child("Checking integrity"),
                        should_interrupt,
                    )?;
                    statistics.push(IndexStatistics {
                        path: bundle.index.path().to_owned(),
                        statistics: SingleOrMultiStatistics::Single(
                            outcome
                                .pack_traverse_statistics
                                .expect("pack provided so there are stats"),
                        ),
                    });
                }
                IndexAndPacks::MultiIndex(bundle) => {
                    let index;
                    let index = match bundle.multi_index.loaded() {
                        Some(index) => index.deref(),
                        None => {
                            index = pack::multi_index::File::at(bundle.multi_index.path())?;
                            &index
                        }
                    };
                    let outcome = index.verify_integrity(
                        progress.add_child("Checking integrity"),
                        should_interrupt,
                        options.clone(),
                    )?;

                    let index_dir = bundle.multi_index.path().parent().expect("file in a directory");
                    statistics.push(IndexStatistics {
                        // Fix: previously `Default::default()` (an empty path), which lost the location
                        // of the multi-pack index this statistic belongs to.
                        path: bundle.multi_index.path().to_owned(),
                        statistics: SingleOrMultiStatistics::Multi(
                            outcome
                                .pack_traverse_statistics
                                .into_iter()
                                .zip(index.index_names())
                                .map(|(statistics, index_name)| (index_dir.join(index_name), statistics))
                                .collect(),
                        ),
                    });
                }
            }
            progress.inc();
        }

        progress.init(
            Some(index.loose_dbs.len()),
            git_features::progress::count("loose object stores"),
        );
        let mut loose_object_stores = Vec::new();
        for loose_db in &*index.loose_dbs {
            let out = loose_db
                .verify_integrity(
                    progress.add_child(loose_db.path().display().to_string()),
                    should_interrupt,
                )
                .map(|statistics| integrity::LooseObjectStatistics {
                    path: loose_db.path().to_owned(),
                    statistics,
                })?;
            loose_object_stores.push(out);
        }

        Ok(integrity::Outcome {
            loose_object_stores,
            index_statistics: statistics,
            progress,
        })
    }
}
2 changes: 2 additions & 0 deletions git-odb/src/store_impls/loose/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ fn hash_path(id: &git_hash::oid, mut root: PathBuf) -> PathBuf {
pub mod find;
///
pub mod iter;
///
pub mod verify;

The type for an iterator over `Result<git_hash::ObjectId, Error>`
pub struct Iter {
Expand Down
86 changes: 86 additions & 0 deletions git-odb/src/store_impls/loose/verify.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use std::{
sync::atomic::{AtomicBool, Ordering},
time::Instant,
};

use git_features::progress::Progress;

use crate::{loose::Store, Write};

/// Types and errors produced by [`Store::verify_integrity()`][super::Store::verify_integrity()].
pub mod integrity {
/// The error returned by [`verify_integrity()`][super::Store::verify_integrity()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("{kind} object {id} could not be decoded")]
ObjectDecode {
// `source` is recognized by thiserror and wired up as the error's source.
source: git_object::decode::Error,
kind: git_object::Kind,
id: git_hash::ObjectId,
},
#[error("{kind} object {expected} wasn't re-encoded without change - new hash is {actual}")]
ObjectHashMismatch {
kind: git_object::Kind,
actual: git_hash::ObjectId,
expected: git_hash::ObjectId,
},
// Emitted when an object yielded by iteration can no longer be found or read.
#[error("Objects were deleted during iteration - try again")]
Retry,
#[error("Interrupted")]
Interrupted,
}

/// The outcome returned by [`verify_integrity()`][super::Store::verify_integrity()].
#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub struct Statistics {
/// The amount of loose objects we checked.
pub num_objects: usize,
}
}

impl Store {
/// Check all loose objects for their integrity checking their hash matches the actual data and by decoding them fully.
pub fn verify_integrity(
&self,
mut progress: impl Progress,
should_interrupt: &AtomicBool,
) -> Result<integrity::Statistics, integrity::Error> {
let mut buf = Vec::new();
let sink = crate::sink(self.object_hash);

let mut num_objects = 0;
let start = Instant::now();
let mut progress = progress.add_child("validating");
progress.init(None, git_features::progress::count("objects"));
for id in self.iter().filter_map(Result::ok) {
let object = self
.try_find(id, &mut buf)
.map_err(|_| integrity::Error::Retry)?
.ok_or(integrity::Error::Retry)?;
let actual_id = sink.write_buf(object.kind, object.data).expect("sink never fails");
if actual_id != id {
return Err(integrity::Error::ObjectHashMismatch {
kind: object.kind,
actual: actual_id,
expected: id,
});
}
object.decode().map_err(|err| integrity::Error::ObjectDecode {
source: err,
kind: object.kind,
id,
})?;

progress.inc();
num_objects += 1;
if should_interrupt.load(Ordering::SeqCst) {
return Err(integrity::Error::Interrupted);
}
}
progress.show_throughput(start);

Ok(integrity::Statistics { num_objects })
}
}
2 changes: 1 addition & 1 deletion git-odb/tests/odb/sink/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use git_odb::Write;

use crate::store::loose::backend::{locate_oid, object_ids};
use crate::store::loose::{locate_oid, object_ids};

#[test]
fn write() -> Result<(), Box<dyn std::error::Error>> {
Expand Down
Loading