@@ -1,15 +1,12 @@
-use core::{
-    arch::x86_64::{
-        __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256,
-        _mm256_madd_epi16, _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8,
-        _mm256_slli_epi32, _mm256_storeu_si256, _mm256_zextsi128_si256, _mm_add_epi32,
-        _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32, _mm_unpackhi_epi64,
-    },
-    mem::MaybeUninit,
+use core::arch::x86_64::{
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256, _mm256_madd_epi16,
+    _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8, _mm256_slli_epi32,
+    _mm256_zextsi128_si256, _mm_add_epi32, _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32,
+    _mm_unpackhi_epi64,
 };
 
 use crate::adler32::{
-    generic::{adler32_copy_len_16, adler32_len_16, adler32_len_64},
+    generic::{adler32_len_16, adler32_len_64},
     BASE, NMAX,
 };
 
@@ -63,20 +60,11 @@ unsafe fn partial_hsum256(x: __m256i) -> u32 {
 
 pub fn adler32_avx2(adler: u32, src: &[u8]) -> u32 {
     assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<false>(adler, &mut [], src) }
-}
-
-pub fn adler32_fold_copy_avx2(adler: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
-    assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<true>(adler, dst, src) }
+    unsafe { adler32_avx2_help(adler, src) }
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn adler32_avx2_help<const COPY: bool>(
-    adler: u32,
-    mut dst: &mut [MaybeUninit<u8>],
-    src: &[u8],
-) -> u32 {
+unsafe fn adler32_avx2_help(adler: u32, src: &[u8]) -> u32 {
     if src.is_empty() {
         return adler;
     }
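
Note on the pattern kept here: the safe public wrapper asserts AVX2 support before calling the `#[target_feature(enable = "avx2")]` helper, which is what makes the `unsafe` call sound. A minimal sketch of the same guard using the standard library's runtime detection instead of this crate's `cpu_features` module (`sum` and `sum_avx2` are illustrative names, not from this diff; x86_64 only):

```rust
// Sketch of the runtime-dispatch pattern: detect the feature once,
// then call the target_feature function inside an unsafe block.
#[target_feature(enable = "avx2")]
unsafe fn sum_avx2(data: &[u8]) -> u32 {
    // With the target feature enabled, the compiler is free to
    // auto-vectorize this body with AVX2 instructions.
    data.iter().map(|&b| u32::from(b)).sum()
}

fn sum(data: &[u8]) -> u32 {
    if is_x86_feature_detected!("avx2") {
        // Safety: we just verified at runtime that AVX2 is available.
        unsafe { sum_avx2(data) }
    } else {
        data.iter().map(|&b| u32::from(b)).sum()
    }
}
```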
@@ -87,21 +75,9 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
     let mut adler0 = adler & 0xffff;
 
     let adler = if before.len() < 16 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_16(adler0, before, adler1)
-        }
+        adler32_len_16(adler0, before, adler1)
     } else if before.len() < 32 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_64(adler0, before, adler1)
-        }
+        adler32_len_64(adler0, before, adler1)
     } else {
         adler
     };
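
For context, `adler0` and `adler1` are the two running sums of the Adler-32 definition: the low 16 bits of the checksum hold the byte sum s1 and the high 16 bits the sum-of-sums s2, both modulo 65521 (the largest prime below 2^16). A scalar sketch of the checksum that these vectorized paths compute; this is not the crate's `adler32_len_16`/`adler32_len_64`:

```rust
// Scalar reference for the two running sums the SIMD code maintains.
const BASE: u32 = 65521; // largest prime below 2^16

fn adler32_scalar(adler: u32, data: &[u8]) -> u32 {
    let mut s1 = adler & 0xffff; // byte sum (adler0 in the SIMD code)
    let mut s2 = (adler >> 16) & 0xffff; // sum of sums (adler1)
    for &byte in data {
        s1 = (s1 + u32::from(byte)) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    (s2 << 16) | s1
}
```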
@@ -111,25 +87,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
 
     // use largest step possible (without causing overflow)
     for chunk in middle.chunks(NMAX as usize / 32) {
-        (adler0, adler1) = unsafe { helper_32_bytes::<COPY>(adler0, adler1, dst, chunk) };
-        if COPY {
-            dst = &mut dst[32 * chunk.len()..];
-        }
+        (adler0, adler1) = unsafe { helper_32_bytes(adler0, adler1, chunk) };
     }
 
     if !after.is_empty() {
         if after.len() < 16 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_16(adler0, after, adler1);
-            }
+            return adler32_len_16(adler0, after, adler1);
         } else if after.len() < 32 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_64(adler0, after, adler1);
-            }
+            return adler32_len_64(adler0, after, adler1);
         } else {
             unreachable!()
         }
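
The chunk size `NMAX as usize / 32` in the loop above follows the classic zlib bound: NMAX (5552) is the largest number of bytes whose worst-case contribution keeps the s2 accumulator within `u32` before a modulo reduction, and each `__m256i` covers 32 bytes. A sketch checking that bound, assuming the usual constants:

```rust
// Why chunks of NMAX / 32 vectors are safe: even if every byte is 255
// and both sums start just below BASE, the s2 accumulator stays
// within u32 for NMAX bytes between reductions.
fn nmax_bound_holds() -> bool {
    const BASE: u64 = 65521;
    const NMAX: u64 = 5552;
    // Worst-case s2 after NMAX bytes of 0xff, starting near BASE.
    let worst_s2 = 255 * NMAX * (NMAX + 1) / 2 + (NMAX + 1) * (BASE - 1);
    worst_s2 <= u64::from(u32::MAX)
}
```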
@@ -139,26 +104,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn helper_32_bytes<const COPY: bool>(
-    mut adler0: u32,
-    mut adler1: u32,
-    dst: &mut [MaybeUninit<u8>],
-    src: &[__m256i],
-) -> (u32, u32) {
+unsafe fn helper_32_bytes(mut adler0: u32, mut adler1: u32, src: &[__m256i]) -> (u32, u32) {
     let mut vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0 as i32));
     let mut vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1 as i32));
 
     let mut vs1_0 = vs1;
     let mut vs3 = ZERO;
 
-    let mut out_chunks = dst.chunks_exact_mut(32);
-
     for vbuf in src.iter().copied() {
-        if COPY {
-            let out_chunk = out_chunks.next().unwrap();
-            _mm256_storeu_si256(out_chunk.as_mut_ptr() as *mut __m256i, vbuf);
-        }
-
         let vs1_sad = _mm256_sad_epu8(vbuf, ZERO); // Sum of abs diff, resulting in 2 x int32's
 
         vs1 = _mm256_add_epi32(vs1, vs1_sad);
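
On the `_mm256_sad_epu8(vbuf, ZERO)` line kept above: a sum of absolute differences against zero is simply the sum of the bytes, one sum per 8-byte group, so the 32-byte vector yields four 64-bit lane sums that `vs1` accumulates. A scalar model of what the intrinsic produces (illustrative only, not the intrinsic itself):

```rust
// Scalar model of _mm256_sad_epu8(v, zero): each 8-byte group of the
// 32-byte vector is summed into the corresponding 64-bit lane.
fn sad_against_zero(bytes: [u8; 32]) -> [u64; 4] {
    let mut lanes = [0u64; 4];
    for (i, chunk) in bytes.chunks_exact(8).enumerate() {
        lanes[i] = chunk.iter().map(|&b| u64::from(b)).sum();
    }
    lanes
}
```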
@@ -240,18 +193,4 @@ mod test {
     unsafe fn slice_assume_init(slice: &[MaybeUninit<u8>]) -> &[u8] {
         &*(slice as *const [MaybeUninit<u8>] as *const [u8])
     }
-
-    #[test]
-    fn fold_copy_copies() {
-        let src: Vec<_> = (0..128).map(|x| x as u8).collect();
-        let mut dst = [MaybeUninit::new(0); 128];
-
-        for (i, _) in src.iter().enumerate() {
-            dst.fill(MaybeUninit::new(0));
-
-            adler32_fold_copy_avx2(1, &mut dst[..i], &src[..i]);
-
-            assert_eq!(&src[..i], unsafe { slice_assume_init(&dst[..i]) })
-        }
-    }
 }