Skip to content

Speedup compress #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 21, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 36 additions & 188 deletions libbz2-rs-sys/src/blocksort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -390,209 +390,57 @@ fn mainGtU(
nblock: u32,
budget: &mut i32,
) -> bool {
let mut k: i32;
let mut c1: u8;
let mut c2: u8;
let mut s1: u16;
let mut s2: u16;

debug_assert_ne!(i1, i2, "mainGtU");

c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
k = nblock.wrapping_add(8 as c_int as c_uint) as i32;
loop {
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);
c1 = block[i1 as usize];
c2 = block[i2 as usize];
let chunk1 = &block[i1 as usize..][..12];
let chunk2 = &block[i2 as usize..][..12];

for (c1, c2) in chunk1.chunks_exact(4).zip(chunk2.chunks_exact(4)) {
let c1 = u32::from_be_bytes(c1[..4].try_into().unwrap());
let c2 = u32::from_be_bytes(c2[..4].try_into().unwrap());

if c1 != c2 {
return c1 > c2;
}
s1 = quadrant[i1 as usize];
s2 = quadrant[i2 as usize];
if s1 != s2 {
return s1 > s2;
}

i1 += 12;
i2 += 12;

for _ in 0..nblock.div_ceil(8) {
let b1 = &block[i1 as usize..][..8];
let b2 = &block[i2 as usize..][..8];

let q1 = &quadrant[i1 as usize..][..8];
let q2 = &quadrant[i2 as usize..][..8];

if b1 != b2 || q1 != q2 {
for (((c1, c2), s1), s2) in b1.iter().zip(b2).zip(q1).zip(q2) {
if c1 != c2 {
return c1 > c2;
}
if s1 != s2 {
return s1 > s2;
}
}
Comment on lines +418 to +425
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't yet found a good way to vectorize this part (I tried some approaches with xor and leading_zeros, but could not get them to be correct so far). At least it's out of the hot path, and the equality check will use full-width loads/compares (even an AVX one for the quadrant compare).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just documenting: I came up with this:

        let lc1 = u64::from_be_bytes(*b1.first_chunk().unwrap());
        let lc2 = u64::from_be_bytes(*b2.first_chunk().unwrap());

        /// Loads the first 16 bytes of `slice` (the storage of eight `u16` values)
        /// as a single `u128` and swaps the two bytes inside every 16-bit lane.
        ///
        /// The load is an unchecked raw-pointer read: nothing here verifies the
        /// slice length, so the caller must pass at least 8 elements.
        ///
        /// NOTE(review): on a little-endian target the result appears to hold
        /// `slice[0]` in the most significant 16 bits, `slice[1]` in the next, and
        /// so on, so comparing two results as integers orders them like an
        /// element-by-element comparison. On a big-endian target `to_be()` is a
        /// no-op and the lane swap below would scramble each value instead;
        /// confirm the intended targets before reusing this.
        #[inline(always)]
        fn transform(slice: &[u16]) -> u128 {
            // Unaligned 16-byte load from the start of the slice, then convert the
            // native-endian value to big-endian (a full byte reversal on
            // little-endian hosts).
            let raw = unsafe { slice.as_ptr().cast::<u128>().read_unaligned().to_be() };
            // Selects the high byte of each of the eight 16-bit lanes.
            let mask = 0xFF00ff00_FF00ff00_FF00ff00_FF00ff00u128;

            // High bytes and low bytes of every lane, separated.
            let upper = raw & mask;
            let lower = raw & !mask;

            // Exchange the two bytes within each lane; the shifts never cross a
            // lane boundary because each operand only has bits in one byte per lane.
            (upper >> 8) | (lower << 8)
        }

        if b1 != b2 || q1 != q2 {
            let lq1 = transform(q1);
            let lq2 = transform(q2);

            let first_bad_c = (lc1 ^ lc2).leading_zeros() / 8;
            let first_bad_q = (lq1 ^ lq2).leading_zeros() / 16;

            if first_bad_c <= first_bad_q {
                return lc1 > lc2;
            } else {
                return lq1 > lq2;
            }
        }

which is OK, but for some reason won't use xmm registers, so it's overall just too many instructions to be profitable.

}
i1 = i1.wrapping_add(1);
i2 = i2.wrapping_add(1);

i1 += 8;
i2 += 8;

if i1 >= nblock {
i1 = i1.wrapping_sub(nblock);
}
if i2 >= nblock {
i2 = i2.wrapping_sub(nblock);
}
k -= 8 as c_int;

*budget -= 1;
if k < 0 as c_int {
break false;
}
}

false
}

static INCS: [i32; 14] = [
1 as c_int,
4 as c_int,
Expand Down
Loading