Skip to content

Commit

Permalink
implement rabincdc
Browse files Browse the repository at this point in the history
  • Loading branch information
Piletskii-Oleg committed May 1, 2024
1 parent 6aab0fc commit 82ce9d4
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/bin/filetest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ fn main() {
let (chunks, time) = match cli.algorithm {
Algorithm::Ultra => chunk_file(ultra::Chunker::new(&buf)),
Algorithm::Leap => chunk_file(leap_based::Chunker::new(&buf)),
Algorithm::Rabin => chunk_file(chunking::rabin::RabinChunker::new(&buf))
};

let total_len = chunks.iter().map(|chunk| chunk.len).sum::<usize>();
Expand Down Expand Up @@ -136,4 +137,5 @@ pub struct Input {
// Chunking algorithm selected for a run; `main` matches on this value to
// decide which chunker is handed to `chunk_file`.
pub enum Algorithm {
// chunk with `ultra::Chunker`
Ultra,
// chunk with `leap_based::Chunker`
Leap,
// chunk with `chunking::rabin::RabinChunker`
Rabin
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod leap_based;
pub mod ultra;
pub mod rabin;

#[derive(Debug)]
pub struct Chunk {
Expand Down
153 changes: 153 additions & 0 deletions src/rabin.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
use std::cmp::min;
use std::fmt::{Debug};
use crate::Chunk;

// Content-defined chunking driven by a rolling hash over a small byte window.
// implementation taken from zbox
// https://github.com/zboxfs/zbox

// rolling hash constants, taken from pcompress implementation
// https://github.com/moinakg/pcompress
// PRIME multiplies the running hash for every byte that enters the window;
// MASK truncates the hash to its low 40 bits after each step.
const PRIME: u64 = 153_191u64;
const MASK: u64 = 0x00ff_ffff_ffffu64;
const MIN_SIZE: usize = 16 * 1024; // minimal chunk size, 16k
const AVG_SIZE: usize = 32 * 1024; // average chunk size, 32k
const MAX_SIZE: usize = 64 * 1024; // maximum chunk size, 64k

// Irreducible polynomial for Rabin modulus, from pcompress.
// Only its low WIN_SIZE bits are consulted when the `ir` table is built.
const FP_POLY: u64 = 0xbfe6_b8a5_bf37_8d83u64;

// since we will skip MIN_SIZE when sliding window, it only
// needs to target (AVG_SIZE - MIN_SIZE) cut length,
// note the (AVG_SIZE - MIN_SIZE) must be 2^n
// A cut is made when all bits of the checksum selected by CUT_MASK are zero.
const CUT_MASK: u64 = (AVG_SIZE - MIN_SIZE - 1) as u64;

// rolling hash window constants
const WIN_SIZE: usize = 16; // must be 2^n
// wraps the circular window index, valid because WIN_SIZE is a power of two
const WIN_MASK: usize = WIN_SIZE - 1;
// hashing starts WIN_SLIDE_OFFSET bytes before MIN_SIZE, so the window already
// holds real data when the first cut test happens at MIN_SIZE
const WIN_SLIDE_OFFSET: usize = 64;
const WIN_SLIDE_POS: usize = MIN_SIZE - WIN_SLIDE_OFFSET;

/// Splits a borrowed byte buffer into chunks, one per call to `next`.
///
/// Chunk boundaries are chosen by `find_border` from a rolling hash of the
/// data, so they depend on content rather than on fixed offsets.
pub struct RabinChunker<'a> {
// the whole input being chunked; never modified
buf: &'a [u8],
params: ChunkerParams, // pre-calculated lookup tables for the rolling hash
// offset into `buf` at which the next chunk starts
pos: usize
}

/// Pre-calculated chunker parameters
///
/// Built once by `ChunkerParams::new` and then only read while hashing.
#[derive(Clone)]
struct ChunkerParams {
poly_pow: u64, // PRIME multiplied WIN_SIZE times, masked with MASK after each step
out_map: Vec<u64>, // value subtracted from the hash for a byte leaving the window, indexed by byte; length is 256
ir: Vec<u64>, // per-byte term derived from FP_POLY, XORed into the hash before the cut test; length is 256
}

impl<'a> RabinChunker<'a> {
    /// Creates a chunker positioned at the start of `buf`.
    ///
    /// The hash lookup tables are computed here, once per chunker.
    pub fn new(buf: &'a [u8]) -> RabinChunker {
        let params = ChunkerParams::new();

        RabinChunker {
            buf,
            params,
            pos: 0,
        }
    }
}

impl<'a> Iterator for RabinChunker<'a> {
    type Item = Chunk;

    /// Yields the next chunk as `(start offset, length)`, or `None` once
    /// `find_border` reports that no data is left.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.pos;
        let unread = &self.buf[start..self.buf.len()];

        find_border(unread, &self.params).map(|length| {
            // the following chunk begins right after this one
            self.pos += length;
            Chunk::new(start, length)
        })
    }
}

/// Returns the length of the chunk that starts at the beginning of `buf`.
///
/// * `None` when `buf` is empty.
/// * Otherwise `Some(len)` with `1 <= len <= min(buf.len(), MAX_SIZE)`.
///
/// The first `WIN_SLIDE_POS` bytes are skipped without hashing. From there a
/// rolling hash over the last `WIN_SIZE` bytes is maintained, and once the
/// chunk is at least `MIN_SIZE` long it is cut at the first position whose
/// checksum has all `CUT_MASK` bits clear. If no such position exists the
/// chunk ends at `MAX_SIZE` or at the end of `buf`, whichever comes first.
fn find_border(buf: &[u8], params: &ChunkerParams) -> Option<usize> {
    if buf.is_empty() {
        return None;
    }

    // A chunk is cut at MAX_SIZE at the latest, so never look further.
    let remaining = min(MAX_SIZE, buf.len());

    // Input too short to reach the point where hashing starts: the whole
    // tail is a single chunk. Without this guard the function would report
    // WIN_SLIDE_POS bytes even though fewer are available.
    if remaining <= WIN_SLIDE_POS {
        return Some(remaining);
    }

    let mut win = [0u8; WIN_SIZE];
    let mut win_idx = 0;
    let mut roll_hash = 0u64;

    for (offset, &ch) in buf[WIN_SLIDE_POS..remaining].iter().enumerate() {
        // byte that falls out of the window when `ch` enters it
        let out = win[win_idx] as usize;
        let pushed_out = params.out_map[out];

        // calculate Rabin rolling hash
        roll_hash = (roll_hash * PRIME) & MASK;
        roll_hash += u64::from(ch);
        roll_hash = roll_hash.wrapping_sub(pushed_out) & MASK;

        // forward circle window
        win[win_idx] = ch;
        win_idx = (win_idx + 1) & WIN_MASK;

        // length of the chunk if it were cut right after `ch`
        let chunk_len = WIN_SLIDE_POS + offset + 1;

        if chunk_len >= MIN_SIZE {
            let checksum = roll_hash ^ params.ir[out];

            if (checksum & CUT_MASK) == 0 {
                return Some(chunk_len);
            }
        }
    }

    // No cut point found: stop at MAX_SIZE or at the end of the data.
    Some(remaining)
}

impl ChunkerParams {
    /// Computes the rolling-hash lookup tables.
    ///
    /// * `poly_pow` is the starting value from `Default` multiplied by
    ///   `PRIME` `WIN_SIZE` times, masked with `MASK` after every step.
    /// * `out_map[b]` is `b * poly_pow`, masked.
    /// * `ir[b]` is `1` plus, for every set bit `k` among the low `WIN_SIZE`
    ///   bits of `FP_POLY`, the masked product of `b` and the masked `k`-th
    ///   power of `PRIME`.
    fn new() -> Self {
        let mut params = ChunkerParams::default();

        // calculate poly power, it is actually PRIME ^ WIN_SIZE
        params.poly_pow =
            (0..WIN_SIZE).fold(params.poly_pow, |acc, _| (acc * PRIME) & MASK);
        let poly_pow = params.poly_pow;

        // pre-calculate out map table and irreducible polynomial
        // for each possible byte, copy from PCompress implementation
        for byte in 0..256usize {
            let byte_val = byte as u64;
            params.out_map[byte] = (byte_val * poly_pow) & MASK;

            let mut val = 1u64;
            let mut pow = 1u64;
            for bit in 0..WIN_SIZE {
                // bit `bit` of FP_POLY decides whether this power contributes
                if ((FP_POLY >> bit) & 1) != 0 {
                    val += (pow * byte_val) & MASK;
                }
                pow = (pow * PRIME) & MASK;
            }
            params.ir[byte] = val;
        }

        params
    }
}

impl Default for ChunkerParams {
fn default() -> Self {
let mut ret = ChunkerParams {
poly_pow: 1,
out_map: vec![0u64; 256],
ir: vec![0u64; 256],
};
ret.out_map.shrink_to_fit();
ret.ir.shrink_to_fit();
ret
}
}

0 comments on commit 82ce9d4

Please sign in to comment.