@@ -42,7 +42,9 @@ class float16_t {
4242 operator float () const ;
4343
4444 static void cvt_float_to_float16 (const float *src, float16_t *dst, int size);
45+ static void cvt_float_to_float16_MT (const float *src, float16_t *dst, int size);
4546 static void cvt_float16_to_float (const float16_t *src, float *dst, int size);
47+ static void cvt_float16_to_float_MT (const float16_t *src, float *dst, int size);
4648 static void float_add_float16 (const float *src1, const float16_t *src2, float *dst, int size);
4749
4850private:
@@ -150,6 +152,36 @@ inline void float16_t::cvt_float_to_float16(const float *src, float16_t *dst, in
150152 }
151153}
152154
155+ inline void float16_t::cvt_float_to_float16_MT (const float *src, float16_t *dst, int size) {
156+ // Round to nearest even mode
157+ constexpr int rounding_mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
158+
159+ // Process 16 floats (AVX512 is a 512-bit SIMD register)
160+ constexpr int kStep = 16 ;
161+ int blockSize = size / kStep ;
162+ int remainder = size % kStep ;
163+
164+ // Process blocks of 16 floats at a time
165+ #pragma omp parallel for
166+ for (int i = 0 ; i < blockSize; ++i) {
167+ // Load the input floats into a AVX512 register
168+ __m512 input_vector = _mm512_loadu_ps (src + i * kStep );
169+
170+ // Convert the floats to float16_t using AVX512 intrinsics
171+ __m256i output_vector = _mm512_cvtps_ph (input_vector, rounding_mode);
172+
173+ // Store the converted values in the output array
174+ _mm256_mask_storeu_epi16 (dst + i * kStep , 0xffff , output_vector);
175+ }
176+
177+ if (remainder != 0 ) {
178+ __mmask16 mask = 0xFFFF >> (kStep - remainder);
179+ __m512 input_vector = _mm512_maskz_loadu_ps (mask, src + size - remainder);
180+ __m256i output_vector = _mm512_cvtps_ph (input_vector, rounding_mode);
181+ _mm256_mask_storeu_epi16 (dst + size - remainder, mask, output_vector);
182+ }
183+ }
184+
153185inline void float16_t::cvt_float16_to_float (const float16_t *src, float *dst, int size) {
154186 // Process 16 floats (AVX512 is a 512-bit SIMD register)
155187 constexpr int kStep = 16 ;
@@ -170,6 +202,27 @@ inline void float16_t::cvt_float16_to_float(const float16_t *src, float *dst, in
170202 }
171203}
172204
205+ inline void float16_t::cvt_float16_to_float_MT (const float16_t *src, float *dst, int size) {
206+ // Process 16 floats (AVX512 is a 512-bit SIMD register)
207+ constexpr int kStep = 16 ;
208+ int blockSize = size / kStep ;
209+ int remainder = size % kStep ;
210+
211+ #pragma omp parallel for
212+ for (int i = 0 ; i < blockSize; ++i) {
213+ __m256i input_vector = _mm256_maskz_loadu_epi16 (0xffff , src + i * kStep );
214+ __m512 output_vector = _mm512_cvtph_ps (input_vector);
215+ _mm512_storeu_ps (dst + i * kStep , output_vector);
216+ }
217+
218+ if (remainder != 0 ) {
219+ __mmask16 mask = 0xFFFF >> (kStep - remainder);
220+ __m256i input_vector = _mm256_maskz_loadu_epi16 (mask, src + size - remainder);
221+ __m512 output_vector = _mm512_cvtph_ps (input_vector);
222+ _mm512_mask_storeu_ps (dst + size - remainder, mask, output_vector);
223+ }
224+ }
225+
173226inline void float16_t::float_add_float16 (const float *src1, const float16_t *src2, float *dst, int size) {
174227 constexpr int kStep = 16 ;
175228 int blockSize = size / kStep ;
0 commit comments