Fix comparison of changed files #29

Merged 6 commits on Nov 21, 2022
16 changes: 16 additions & 0 deletions Cargo.toml
@@ -14,6 +14,20 @@ include = ["src/**/*", "LICENSE.md", "README.md", "CHANGELOG.md"]
[lib]
test = false

[[test]]
name = "baseline"
path = "tests/baseline.rs"
required-features = ["max-performance"]

[[test]]
name = "baseline-atomic"
path = "tests/baseline_atomic.rs.rs"
required-features = ["max-performance"]

[features]
default = []
max-performance = ["git-repository/max-performance"]

[dependencies]
git-repository = { version = "0.28.0", default-features = false, features = ["max-performance-safe", "blocking-network-client", "blocking-http-transport"] }
serde = { version = "1", features = ["std", "derive"] }
@@ -22,6 +36,8 @@ smartstring = "1.0.1"
serde_json = "1"
bstr = "1.0.1"
thiserror = "1.0.32"
ahash = "0.8.0"
hashbrown = { version = "0.13.1", features = ["raw"] }

[dev-dependencies]
git-testtools = "0.9.0"
5 changes: 4 additions & 1 deletion Makefile
@@ -6,7 +6,10 @@ CARGO = $(shell command -v cargo)

##@ Development

baseline-atomic: ## run very slow tests that single-step through all commits
GITOXIDE_PACK_CACHE_MEMORY=1g GITOXIDE_OBJECT_CACHE_MEMORY=3g RUST_BACKTRACE=1 cargo test --test baseline_atomic --release --features max-performance -- --nocapture

test: ## run all tests with cargo
RUST_BACKTRACE=1 cargo test --test crates-index-diff
RUST_BACKTRACE=1 cargo test --test baseline --release
GITOXIDE_PACK_CACHE_MEMORY=1g RUST_BACKTRACE=1 cargo test --test baseline --release --features max-performance

172 changes: 69 additions & 103 deletions src/index/diff/delegate.rs
@@ -1,14 +1,13 @@
use crate::index::diff::Error;
use crate::{Change, CrateVersion};
use ahash::{AHashSet, RandomState};
use bstr::BStr;
use git_repository as git;
use std::collections::BTreeSet;
use std::ops::Range;
use hashbrown::raw::RawTable;

#[derive(Default)]
pub(crate) struct Delegate {
changes: Vec<Change>,
deleted_version_ids: BTreeSet<u64>,
err: Option<Error>,
}

@@ -32,16 +31,18 @@ impl Delegate {
if change.location.contains(&b'.') {
return Ok(Default::default());
}

match change.event {
Addition { entry_mode, id } => {
if let Some(obj) = entry_data(entry_mode, id)? {
for line in obj.data.lines() {
let version = version_from_json_line(line, change.location)?;
self.changes.push(if version.yanked {
Change::Yanked(version)
let change = if version.yanked {
Change::AddedAndYanked(version)
} else {
Change::Added(version)
});
};
self.changes.push(change)
}
}
}
@@ -60,115 +61,80 @@ impl Delegate {
}
Modification { .. } => {
if let Some(diff) = change.event.diff().transpose()? {
let mut old_lines = AHashSet::with_capacity(1024);
let location = change.location;
for line in diff.old.data.lines() {
// Safety: We transform an &'_ [u8] to an &'static [u8] here.
// This is safe because we always drain the hashmap at the end of the function.
// The reason the HashMap uses a 'static lifetime is that we want to reuse
// the allocation for modifications.
old_lines.insert(line);
}

let input = diff.line_tokens();
let mut err = None;
git::diff::blob::diff(
diff.algo,
&input,
|before: Range<u32>, after: Range<u32>| {
if err.is_some() {
return;
}
let mut lines_before = input.before
[before.start as usize..before.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr())
.peekable();
let mut lines_after = input.after
[after.start as usize..after.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr())
.peekable();
let mut remember = |version: CrateVersion| {
self.changes.push(if version.yanked {
Change::Yanked(version)
} else {
Change::Added(version)
});
};
'outer: loop {
match (lines_before.peek().is_some(), lines_after.peek().is_some())
{
(true, false) => {
for removed in lines_before {
match version_from_json_line(removed, location) {
Ok(version) => {
self.deleted_version_ids.insert(version.id());
}
Err(e) => {
err = Some(e);
break;
}
}
}
break 'outer;
}
(false, true) => {
for inserted in lines_after {
match version_from_json_line(inserted, location) {
Ok(version) => remember(version),
Err(e) => {
err = Some(e);
break;
}
}
}
break 'outer;
}
(true, true) => {
for (removed, inserted) in
lines_before.by_ref().zip(lines_after.by_ref())
{
match version_from_json_line(inserted, location)
.and_then(|inserted| {
version_from_json_line(removed, location)
.map(|removed| (removed, inserted))
}) {
Ok((removed_version, inserted_version)) => {
if removed_version.yanked
!= inserted_version.yanked
{
remember(inserted_version);
}
}
Err(e) => {
err = Some(e);
break;
}
}
}
}
(false, false) => break 'outer,
}
// A RawTable is used to represent a Checksum -> CrateVersion map
// because the checksum is already stored in the CrateVersion
// and we want to avoid storing the checksum twice for performance reasons
let mut new_versions = RawTable::with_capacity(old_lines.len().min(1024));
let hasher = RandomState::new();

for line in diff.new.data.lines() {
// First, quickly check if the exact same line is already present in this file; in that case we don't need to do anything else.
if old_lines.remove(line) {
continue;
}
// No need to check if the checksum already exists in the hashmap,
// as each checksum appears only once.
let new_version = version_from_json_line(line, location)?;
new_versions.insert(
hasher.hash_one(new_version.checksum),
new_version,
|rehashed| hasher.hash_one(rehashed.checksum),
);
}

let mut deleted = Vec::new();
for line in old_lines.drain() {
let old_version = version_from_json_line(line, location)?;
let new_version = new_versions
.remove_entry(hasher.hash_one(old_version.checksum), |version| {
version.checksum == old_version.checksum
});
match new_version {
Some(new_version) => {
let change = match (old_version.yanked, new_version.yanked) {
(true, false) => Change::Unyanked(new_version),
(false, true) => Change::Yanked(new_version),
_ => continue,
};
self.changes.push(change)
}
},
);
if let Some(err) = err {
return Err(err);
None => deleted.push(old_version),
}
}
if !deleted.is_empty() {
self.changes.push(Change::Deleted {
name: deleted[0].name.to_string(),
versions: deleted,
})
}
for version in new_versions.drain() {
let change = if version.yanked {
Change::AddedAndYanked(version)
} else {
Change::Added(version)
};
self.changes.push(change);
}
}
}
}
Ok(Default::default())
}

pub fn into_result(mut self) -> Result<Vec<Change>, Error> {
pub fn into_result(self) -> Result<Vec<Change>, Error> {
match self.err {
Some(err) => Err(err),
None => {
if !self.deleted_version_ids.is_empty() {
let deleted_version_ids = &self.deleted_version_ids;
self.changes.retain(|change| match change {
Change::Added(v) | Change::Yanked(v) => {
!deleted_version_ids.contains(&v.id())
}
Change::Deleted { .. } => true,
})
}
Ok(self.changes)
}
None => Ok(self.changes),
}
}
}
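For orientation, the rewritten `Modification` branch above replaces the old line-by-line blob diff with a set-difference over the JSON lines of the old and new file, followed by pairing the leftovers by checksum. Below is a minimal sketch of that strategy, assuming simplified stand-in types (`Version`, `parse`, `diff_versions` are illustrative, not part of the crate) and plain `std::collections` containers instead of the `hashbrown::raw::RawTable` used in the actual change:

```rust
use std::collections::{HashMap, HashSet};

// Simplified stand-ins for the crate's `CrateVersion` and `Change` types.
struct Version {
    checksum: [u8; 32],
    yanked: bool,
}

enum Change {
    Added(Version),
    AddedAndYanked(Version),
    Yanked(Version),
    Unyanked(Version),
    Deleted(Vec<Version>),
}

fn diff_versions(old: &[&str], new: &[&str], parse: impl Fn(&str) -> Version) -> Vec<Change> {
    let mut changes = Vec::new();

    // 1. Remember every old line; lines identical on both sides cancel out cheaply.
    let mut old_lines: HashSet<&str> = old.iter().copied().collect();

    // 2. Every new line without an identical old counterpart is parsed and indexed by checksum.
    //    (The actual change uses a `hashbrown::raw::RawTable` keyed by the checksum's hash
    //    so the checksum is not stored twice.)
    let mut new_versions: HashMap<[u8; 32], Version> = HashMap::new();
    for line in new {
        if old_lines.remove(line) {
            continue;
        }
        let version = parse(line);
        new_versions.insert(version.checksum, version);
    }

    // 3. Remaining old lines either pair up with a new version (the yanked flag flipped)
    //    or have no counterpart at all (the version was deleted).
    let mut deleted = Vec::new();
    for line in old_lines.drain() {
        let old_version = parse(line);
        match new_versions.remove(&old_version.checksum) {
            Some(new_version) => match (old_version.yanked, new_version.yanked) {
                (true, false) => changes.push(Change::Unyanked(new_version)),
                (false, true) => changes.push(Change::Yanked(new_version)),
                _ => {}
            },
            None => deleted.push(old_version),
        }
    }
    if !deleted.is_empty() {
        changes.push(Change::Deleted(deleted));
    }

    // 4. Whatever is left on the new side was genuinely added.
    for (_, version) in new_versions.drain() {
        changes.push(if version.yanked {
            Change::AddedAndYanked(version)
        } else {
            Change::Added(version)
        });
    }
    changes
}
```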
20 changes: 20 additions & 0 deletions src/index/diff/mod.rs
@@ -164,6 +164,16 @@ impl Index {

/// Similar to `changes()`, but requires `from` and `to` objects to be provided. They may point
/// to either `Commit`s or `Tree`s.
///
/// # Returns
///
/// A list of atomic changes that were performed on the index
/// between the two revisions.
/// The changes are grouped by the crate they belong to.
/// The order of the changes for each crate is **non-deterministic**.
/// The order of crates is also **non-deterministic**.
///
/// If a specific order is required, the changes must be sorted by the caller.
pub fn changes_between_commits(
&self,
from: impl Into<git::hash::ObjectId>,
@@ -237,6 +247,16 @@ impl Index {
/// Learn more about specifying revisions
/// in the
/// [official documentation](https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html)
///
/// # Returns
///
/// A list of atomic changes that were performed on the index
/// between the two revisions.
/// The changes are grouped by the crate they belong to.
/// The order of the changes for each crate is **non-deterministic**.
/// The order of crates is also **non-deterministic**.
///
/// If a specific order is required, the changes must be sorted by the caller.
pub fn changes(
&self,
from: impl AsRef<str>,
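Because the new documentation on `changes()` and `changes_between_commits()` states that ordering is non-deterministic, callers that need stable output must sort themselves. A hypothetical caller-side sketch (not part of this PR), keyed on crate name and version via the new `Change::versions()` accessor and assuming `Change` is re-exported at the crate root:

```rust
use crates_index_diff::Change;

// Hypothetical helper on the caller side; `sort_changes` is not part of the crate.
fn sort_changes(changes: &mut [Change]) {
    changes.sort_by_key(|change| {
        // `versions()` returns at least one version for every change kind.
        let v = &change.versions()[0];
        (v.name.clone(), v.version.clone())
    });
}
```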
53 changes: 37 additions & 16 deletions src/types.rs
@@ -2,8 +2,8 @@ use std::collections::HashMap;

use git_repository as git;
use smartstring::alias::String as SmolString;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::hash::Hash;
use std::{fmt, slice};

/// A wrapper for a repository of the crates.io index.
pub struct Index {
@@ -22,8 +22,15 @@ pub struct Index {
/// Identify a kind of change that occurred to a crate
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum Change {
/// A crate version was added or it was unyanked.
/// A crate version was added.
Added(CrateVersion),
/// A crate version was unyanked.
Unyanked(CrateVersion),
/// A crate version was added in a yanked state.
///
/// This can happen if we don't see the commit that added it, so it appears to pop into existence already yanked.
/// Knowing this should help trigger the correct action, as crates that are merely `Yanked` would be treated quite differently.
AddedAndYanked(CrateVersion),
/// A crate version was yanked.
Yanked(CrateVersion),
/// The name of the crate whose file was deleted, which implies all versions were deleted as well.
@@ -39,15 +46,23 @@ impl Change {
/// Return the added crate, if this is this kind of change.
pub fn added(&self) -> Option<&CrateVersion> {
match self {
Change::Added(v) => Some(v),
Change::Added(v) | Change::AddedAndYanked(v) => Some(v),
_ => None,
}
}

/// Return the yanked crate, if this is this kind of change.
pub fn yanked(&self) -> Option<&CrateVersion> {
match self {
Change::Yanked(v) => Some(v),
Change::Yanked(v) | Change::AddedAndYanked(v) => Some(v),
_ => None,
}
}

/// Return the unyanked crate, if this is this kind of change.
pub fn unyanked(&self) -> Option<&CrateVersion> {
match self {
Change::Unyanked(v) => Some(v),
_ => None,
}
}
@@ -59,6 +74,21 @@ impl Change {
_ => None,
}
}

/// Returns all versions affected by this change.
///
/// The returned slice usually has length 1.
/// However, if a crate was purged from the index by an admin,
/// all versions of the purged crate are returned.
pub fn versions(&self) -> &[CrateVersion] {
match self {
Change::Added(v)
| Change::Unyanked(v)
| Change::AddedAndYanked(v)
| Change::Yanked(v) => slice::from_ref(v),
Change::Deleted { versions, .. } => versions,
}
}
}

impl fmt::Display for Change {
@@ -70,6 +100,8 @@ impl fmt::Display for Change {
Change::Added(_) => "added",
Change::Yanked(_) => "yanked",
Change::Deleted { .. } => "deleted",
Change::Unyanked(_) => "unyanked",
Change::AddedAndYanked(_) => "added and yanked",
}
)
}
@@ -109,17 +141,6 @@ pub struct CrateVersion {
pub dependencies: Vec<Dependency>,
}

impl CrateVersion {
pub(crate) fn id(&self) -> u64 {
let mut s = std::collections::hash_map::DefaultHasher::new();
self.name.hash(&mut s);
self.yanked.hash(&mut s);
self.version.hash(&mut s);
self.checksum.hash(&mut s);
s.finish()
}
}

/// A single dependency of a specific crate version
#[derive(
Clone, serde::Serialize, serde::Deserialize, Ord, PartialOrd, Eq, PartialEq, Debug, Hash,
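Downstream code is expected to match on the grown `Change` enum rather than on the previous three variants. A hypothetical consumer (the `describe` helper below is illustrative and not part of this PR):

```rust
use crates_index_diff::Change;

// Hypothetical consumer of the extended enum; not part of the crate.
fn describe(change: &Change) -> String {
    match change {
        Change::Added(v) => format!("{} {} was published", v.name, v.version),
        Change::AddedAndYanked(v) => format!("{} {} appeared already yanked", v.name, v.version),
        Change::Yanked(v) => format!("{} {} was yanked", v.name, v.version),
        Change::Unyanked(v) => format!("{} {} was un-yanked", v.name, v.version),
        Change::Deleted { name, versions } => {
            format!("{} was deleted along with {} version(s)", name, versions.len())
        }
    }
}
```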