Skip to content

Commit

Permalink
Add pre hash check (#83)
Browse files: browse the repository at this point in the history
  • Loading branch information
qarmin authored Oct 24, 2020
1 parent 8ecde0f commit d996c3c
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
74 changes: 67 additions & 7 deletions czkawka_core/src/duplicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub enum DeleteMethod {
OneNewest,
}

#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct FileEntry {
pub path: PathBuf,
pub size: u64,
Expand All @@ -51,7 +51,10 @@ pub struct Info {
pub number_of_duplicated_files_by_size: usize,
pub number_of_groups_by_hash: usize,
pub number_of_duplicated_files_by_hash: usize,
pub number_of_duplicated_files_after_pre_hash: usize,
pub number_of_groups_after_pre_hash: usize,
pub lost_space_by_size: u64,
pub lost_space_after_pre_hash: u64,
pub lost_space_by_hash: u64,
pub bytes_read_when_hashing: u64,
pub number_of_removed_files: usize,
Expand Down Expand Up @@ -208,21 +211,21 @@ impl DuplicateFinder {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
let metadata: Metadata = match entry_data.metadata() {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
if metadata.is_dir() {
self.information.number_of_checked_folders += 1;

if !self.recursive_search {
continue;
continue 'dir;
}

let next_folder = current_folder.join(entry_data.file_name());
Expand All @@ -239,7 +242,7 @@ impl DuplicateFinder {
// let mut have_valid_extension: bool;
let file_name_lowercase: String = match entry_data.file_name().into_string() {
Ok(t) => t,
Err(_) => continue,
Err(_) => continue 'dir,
}
.to_lowercase();

Expand Down Expand Up @@ -273,7 +276,7 @@ impl DuplicateFinder {
},
Err(_) => {
self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display()));
continue;
continue 'dir;
} // Permissions Denied
},
};
Expand Down Expand Up @@ -318,10 +321,58 @@ impl DuplicateFinder {
let start_time: SystemTime = SystemTime::now();
let mut file_handler: File;
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>;
let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();

// Step 1 - hash only the first chunk (up to 2 KiB) of each file to discard files that cannot be duplicates
for (size, vector) in &self.files_with_identical_size {
hashmap_with_hash = Default::default();

for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
}
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue;
}
};

let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 2];
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue;
}
};

self.information.bytes_read_when_hashing += n as u64;
hasher.update(&buffer[..n]);

let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
}
for (_string, mut vector) in hashmap_with_hash {
if vector.len() > 1 {
pre_checked_map.entry(*size).or_insert_with(Vec::new);
pre_checked_map.get_mut(size).unwrap().append(&mut vector);
}
}
}
for (size, vector) in pre_checked_map.iter() {
self.information.number_of_duplicated_files_after_pre_hash += vector.len() - 1;
self.information.number_of_groups_after_pre_hash += 1;
self.information.lost_space_after_pre_hash += (vector.len() as u64 - 1) * size;
}

// Step 2 - check the full file hash of the files that survived the pre-hash check
for (size, vector) in &pre_checked_map {
hashmap_with_hash = Default::default();

for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
Expand All @@ -337,7 +388,7 @@ impl DuplicateFinder {
let mut error_reading_file: bool = false;

let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 16384];
let mut buffer = [0u8; 32 * 1024];
let mut read_bytes: u64 = 0;
loop {
let n = match file_handler.read(&mut buffer) {
Expand Down Expand Up @@ -448,11 +499,20 @@ impl DebugPrint for DuplicateFinder {
"Number of duplicated files by size(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
);
println!(
"Number of duplicated files after pre hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_after_pre_hash, self.information.number_of_groups_after_pre_hash
);
println!(
"Number of duplicated files by hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
);
println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
println!(
"Lost space after pre hash - {} ({} bytes)",
self.information.lost_space_after_pre_hash.file_size(options::BINARY).unwrap(),
self.information.lost_space_after_pre_hash
);
println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
println!(
"Gained space by removing duplicated entries - {} ({} bytes)",
Expand Down
4 changes: 2 additions & 2 deletions czkawka_gui/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1583,7 +1583,7 @@ fn main() {
for (size, vectors_vector) in btreemap.iter().rev() {
for vector in vectors_vector {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here
Expand Down Expand Up @@ -1611,7 +1611,7 @@ fn main() {

for (size, vector) in btreemap.iter().rev() {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here
Expand Down

0 comments on commit d996c3c

Please sign in to comment.