
Commit 3af2d51

remove adler32 COPY variant (memcpy is faster)
1 parent: 0e95888

3 files changed: +17 −104 lines

zlib-rs/src/adler32.rs (+3 −7)

```diff
@@ -23,14 +23,10 @@ pub fn adler32(start_checksum: u32, data: &[u8]) -> u32 {
 pub fn adler32_fold_copy(start_checksum: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
     debug_assert!(dst.len() >= src.len(), "{} < {}", dst.len(), src.len());
 
-    #[cfg(target_arch = "x86_64")]
-    if crate::cpu_features::is_enabled_avx2() {
-        return avx2::adler32_fold_copy_avx2(start_checksum, dst, src);
-    }
-
-    let adler = adler32(start_checksum, src);
+    // integrating the memcpy into the adler32 function did not have any benefits, and in fact was
+    // a bit slower for very small chunk sizes.
     dst[..src.len()].copy_from_slice(slice_to_uninit(src));
-    adler
+    adler32(start_checksum, src)
 }
 
 pub fn adler32_combine(adler1: u32, adler2: u32, len2: u64) -> u32 {
```
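
With the AVX2 fold-copy path gone, `adler32_fold_copy` is now a plain `copy_from_slice` (which the compiler lowers to `memcpy`) followed by the ordinary checksum. The `slice_to_uninit` helper it calls is defined elsewhere in the crate and not shown in this diff; a minimal sketch of what such a cast could look like (only the name comes from the code above, the body here is an assumption):

```rust
use core::mem::MaybeUninit;

// Hypothetical sketch of the `slice_to_uninit` helper called above.
fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit<u8>] {
    // SAFETY: `u8` and `MaybeUninit<u8>` have identical layout, and viewing
    // initialized bytes as possibly-uninitialized ones is always sound
    // (the reverse direction would not be).
    unsafe { &*(slice as *const [u8] as *const [MaybeUninit<u8>]) }
}
```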

zlib-rs/src/adler32/avx2.rs (+14 −75)

```diff
@@ -1,15 +1,12 @@
-use core::{
-    arch::x86_64::{
-        __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256,
-        _mm256_madd_epi16, _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8,
-        _mm256_slli_epi32, _mm256_storeu_si256, _mm256_zextsi128_si256, _mm_add_epi32,
-        _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32, _mm_unpackhi_epi64,
-    },
-    mem::MaybeUninit,
+use core::arch::x86_64::{
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256, _mm256_madd_epi16,
+    _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8, _mm256_slli_epi32,
+    _mm256_zextsi128_si256, _mm_add_epi32, _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32,
+    _mm_unpackhi_epi64,
 };
 
 use crate::adler32::{
-    generic::{adler32_copy_len_16, adler32_len_16, adler32_len_64},
+    generic::{adler32_len_16, adler32_len_64},
     BASE, NMAX,
 };
 
@@ -63,20 +60,11 @@ unsafe fn partial_hsum256(x: __m256i) -> u32 {
 
 pub fn adler32_avx2(adler: u32, src: &[u8]) -> u32 {
     assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<false>(adler, &mut [], src) }
-}
-
-pub fn adler32_fold_copy_avx2(adler: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
-    assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<true>(adler, dst, src) }
+    unsafe { adler32_avx2_help(adler, src) }
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn adler32_avx2_help<const COPY: bool>(
-    adler: u32,
-    mut dst: &mut [MaybeUninit<u8>],
-    src: &[u8],
-) -> u32 {
+unsafe fn adler32_avx2_help(adler: u32, src: &[u8]) -> u32 {
     if src.is_empty() {
         return adler;
     }
@@ -87,21 +75,9 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
     let mut adler0 = adler & 0xffff;
 
     let adler = if before.len() < 16 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_16(adler0, before, adler1)
-        }
+        adler32_len_16(adler0, before, adler1)
     } else if before.len() < 32 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_64(adler0, before, adler1)
-        }
+        adler32_len_64(adler0, before, adler1)
     } else {
         adler
     };
@@ -111,25 +87,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
 
     // use largest step possible (without causing overflow)
     for chunk in middle.chunks(NMAX as usize / 32) {
-        (adler0, adler1) = unsafe { helper_32_bytes::<COPY>(adler0, adler1, dst, chunk) };
-        if COPY {
-            dst = &mut dst[32 * chunk.len()..];
-        }
+        (adler0, adler1) = unsafe { helper_32_bytes(adler0, adler1, chunk) };
     }
 
     if !after.is_empty() {
         if after.len() < 16 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_16(adler0, after, adler1);
-            }
+            return adler32_len_16(adler0, after, adler1);
         } else if after.len() < 32 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_64(adler0, after, adler1);
-            }
+            return adler32_len_64(adler0, after, adler1);
         } else {
             unreachable!()
         }
@@ -139,26 +104,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn helper_32_bytes<const COPY: bool>(
-    mut adler0: u32,
-    mut adler1: u32,
-    dst: &mut [MaybeUninit<u8>],
-    src: &[__m256i],
-) -> (u32, u32) {
+unsafe fn helper_32_bytes(mut adler0: u32, mut adler1: u32, src: &[__m256i]) -> (u32, u32) {
     let mut vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0 as i32));
     let mut vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1 as i32));
 
     let mut vs1_0 = vs1;
     let mut vs3 = ZERO;
 
-    let mut out_chunks = dst.chunks_exact_mut(32);
-
     for vbuf in src.iter().copied() {
-        if COPY {
-            let out_chunk = out_chunks.next().unwrap();
-            _mm256_storeu_si256(out_chunk.as_mut_ptr() as *mut __m256i, vbuf);
-        }
-
         let vs1_sad = _mm256_sad_epu8(vbuf, ZERO); // Sum of abs diff, resulting in 2 x int32's
 
         vs1 = _mm256_add_epi32(vs1, vs1_sad);
@@ -240,18 +193,4 @@ mod test {
     unsafe fn slice_assume_init(slice: &[MaybeUninit<u8>]) -> &[u8] {
         &*(slice as *const [MaybeUninit<u8>] as *const [u8])
     }
-
-    #[test]
-    fn fold_copy_copies() {
-        let src: Vec<_> = (0..128).map(|x| x as u8).collect();
-        let mut dst = [MaybeUninit::new(0); 128];
-
-        for (i, _) in src.iter().enumerate() {
-            dst.fill(MaybeUninit::new(0));
-
-            adler32_fold_copy_avx2(1, &mut dst[..i], &src[..i]);
-
-            assert_eq!(&src[..i], unsafe { slice_assume_init(&dst[..i]) })
-        }
-    }
 }
```
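
For orientation: with the `COPY` parameter removed, `helper_32_bytes` only accumulates the checksum. Roughly, `vs1` gathers the byte sums via `_mm256_sad_epu8` while `vs2`/`vs3` track the position-weighted sums. The recurrence being vectorized reduces to the following scalar sketch (illustrative only, not the crate's actual generic implementation):

```rust
const BASE: u32 = 65521; // largest prime below 2^16
const NMAX: usize = 5552; // max bytes before the 32-bit sums could overflow

// Scalar sketch of the adler32 recurrence; assumes the two halves of
// `start_checksum` are already reduced below BASE, as they are for any
// valid adler32 value.
fn adler32_scalar(start_checksum: u32, data: &[u8]) -> u32 {
    let mut a = start_checksum & 0xffff; // running sum of bytes ("adler0")
    let mut b = start_checksum >> 16; // running sum of those sums ("adler1")

    for chunk in data.chunks(NMAX) {
        for &byte in chunk {
            a += byte as u32;
            b += a;
        }
        // reducing only once per NMAX-byte chunk keeps the hot loop free of
        // divisions, mirroring the chunking in the SIMD code above
        a %= BASE;
        b %= BASE;
    }

    b << 16 | a
}
```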

zlib-rs/src/adler32/generic.rs (−22)

```diff
@@ -1,5 +1,3 @@
-use core::mem::MaybeUninit;
-
 use super::{BASE, NMAX};
 
 const UNROLL_MORE: bool = true;
@@ -100,26 +98,6 @@ pub(crate) fn adler32_len_16(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
     adler | (sum2 << 16)
 }
 
-#[cfg_attr(not(target_arch = "x86_64"), allow(unused))]
-pub(crate) fn adler32_copy_len_16(
-    mut adler: u32,
-    dst: &mut [MaybeUninit<u8>],
-    src: &[u8],
-    mut sum2: u32,
-) -> u32 {
-    for (source, destination) in src.iter().zip(dst.iter_mut()) {
-        let v = *source;
-        *destination = MaybeUninit::new(v);
-        adler += v as u32;
-        sum2 += adler;
-    }
-
-    adler %= BASE;
-    sum2 %= BASE; /* only added so many BASE's */
-    /* return recombined sums */
-    adler | (sum2 << 16)
-}
-
 pub(crate) fn adler32_len_64(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
     const N: usize = if UNROLL_MORE { 16 } else { 8 };
     let mut it = buf.chunks_exact(N);
```

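With `adler32_fold_copy_avx2` gone, the `fold_copy_copies` test deleted above loses its subject. An equivalent round-trip check could target the surviving `adler32_fold_copy` instead; a hypothetical sketch (test name and placement assumed, not part of this commit):

```rust
use core::mem::MaybeUninit;

// Hypothetical replacement test; assumes it lives where `adler32` and
// `adler32_fold_copy` are in scope.
#[test]
fn fold_copy_copies_and_checksums() {
    let src: Vec<u8> = (0..128).map(|x| x as u8).collect();
    let mut dst = [MaybeUninit::new(0u8); 128];

    for i in 0..=src.len() {
        dst.fill(MaybeUninit::new(0));

        let checksum = adler32_fold_copy(1, &mut dst[..i], &src[..i]);

        // the destination prefix must be a byte-for-byte copy of the source
        let copied = unsafe { &*(&dst[..i] as *const [MaybeUninit<u8>] as *const [u8]) };
        assert_eq!(&src[..i], copied);

        // and the result must agree with the plain (non-copying) checksum
        assert_eq!(checksum, adler32(1, &src[..i]));
    }
}
```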