GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_x2_convert_8u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 277 277 100.0%
Functions: 10 10 100.0%
Branches: 42 42 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2023 Daniel Estevez <daniel@destevez.net>
4 * Copyright 2012, 2014 Free Software Foundation, Inc.
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11 /*!
12 * \page volk_32f_s32f_x2_convert_8u
13 *
14 * \b Overview
15 *
16 * Converts a floating point number to an 8-bit unsigned int after applying a
17 * multiplicative scaling factor and an additive bias.
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_s32f_x2_convert_8u(uint8_t* outputVector, const float* inputVector,
22 * const float scale, const float bias, unsigned int num_points)
23 * \endcode
24 *
25 * \b Inputs
26 * \li inputVector: the input vector of floats.
27 * \li scale: The value multiplied against each point in the input buffer.
28 * \li bias: The value added to each point after it has been multiplied by the scale.
29 * \li num_points: The number of data points.
30 *
31 * \b Outputs
32 * \li outputVector: The output vector.
33 *
34 * \b Example
35 * Convert floats from [-1,1] to 8-bit unsigned integers with a scale of 128 and a bias of
36 * 128.
37 * int N = 10;
38 * unsigned int alignment = volk_get_alignment();
39 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
40 * uint8_t* out = (uint8_t*)volk_malloc(sizeof(uint8_t)*N, alignment);
41 *
42 * for(unsigned int ii = 0; ii < N; ++ii){
43 * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
44 * }
45 *
46 * float scale = 128.0f;
47 * float bias = 128.0f;
48 *
49 * volk_32f_s32f_x2_convert_8u(out, increasing, scale, bias, N);
50 *
51 * for(unsigned int ii = 0; ii < N; ++ii){
52 * printf("out[%u] = %i\n", ii, out[ii]);
53 * }
54 *
55 * volk_free(increasing);
56 * volk_free(out);
57 * \endcode
58 */
59
60 #ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
61 #define INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
62
63 #include <inttypes.h>
64
65 262462 static inline void volk_32f_s32f_x2_convert_8u_single(uint8_t* out, const float in)
66 {
67 262462 const float min_val = 0.0f;
68 262462 const float max_val = UINT8_MAX;
69
2/2
✓ Branch 0 taken 80080 times.
✓ Branch 1 taken 182382 times.
262462 if (in > max_val) {
70 80080 *out = (uint8_t)(max_val);
71
2/2
✓ Branch 0 taken 79991 times.
✓ Branch 1 taken 102391 times.
182382 } else if (in < min_val) {
72 79991 *out = (uint8_t)(min_val);
73 } else {
74 102391 *out = (uint8_t)(rintf(in));
75 }
76 262462 }
77
78
79 #ifdef LV_HAVE_GENERIC
80
81 2 static inline void volk_32f_s32f_x2_convert_8u_generic(uint8_t* outputVector,
82 const float* inputVector,
83 const float scale,
84 const float bias,
85 unsigned int num_points)
86 {
87 2 const float* inputVectorPtr = inputVector;
88
89
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (unsigned int number = 0; number < num_points; number++) {
90 262142 const float r = *inputVectorPtr++ * scale + bias;
91 262142 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
92 }
93 2 }
94
95 #endif /* LV_HAVE_GENERIC */
96
97
98 #if LV_HAVE_AVX2 && LV_HAVE_FMA
99 #include <immintrin.h>
100
101 2 static inline void volk_32f_s32f_x2_convert_8u_u_avx2_fma(uint8_t* outputVector,
102 const float* inputVector,
103 const float scale,
104 const float bias,
105 unsigned int num_points)
106 {
107 2 const unsigned int thirtysecondPoints = num_points / 32;
108
109 2 const float* inputVectorPtr = (const float*)inputVector;
110 2 uint8_t* outputVectorPtr = outputVector;
111
112 2 const float min_val = 0.0f;
113 2 const float max_val = UINT8_MAX;
114 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
115 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
116
117 2 const __m256 vScale = _mm256_set1_ps(scale);
118 2 const __m256 vBias = _mm256_set1_ps(bias);
119
120
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
121 8190 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
122 8190 inputVectorPtr += 8;
123 8190 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
124 8190 inputVectorPtr += 8;
125 8190 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
126 8190 inputVectorPtr += 8;
127 8190 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
128 8190 inputVectorPtr += 8;
129
130 24570 inputVal1 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
132 24570 inputVal2 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
134 24570 inputVal3 = _mm256_max_ps(
135 _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
136 24570 inputVal4 = _mm256_max_ps(
137 _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);
138
139 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
140 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
141 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
142 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
143
144 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
145 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
146 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
147 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
148
149 8190 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
150 8190 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
151
152 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
153 8190 outputVectorPtr += 32;
154 }
155
156
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
157 62 const float r = inputVector[number] * scale + bias;
158 62 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
159 }
160 2 }
161
162 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
163
164
165 #ifdef LV_HAVE_AVX2
166 #include <immintrin.h>
167
168 2 static inline void volk_32f_s32f_x2_convert_8u_u_avx2(uint8_t* outputVector,
169 const float* inputVector,
170 const float scale,
171 const float bias,
172 unsigned int num_points)
173 {
174 2 const unsigned int thirtysecondPoints = num_points / 32;
175
176 2 const float* inputVectorPtr = (const float*)inputVector;
177 2 uint8_t* outputVectorPtr = outputVector;
178
179 2 const float min_val = 0.0f;
180 2 const float max_val = UINT8_MAX;
181 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
182 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
183
184 2 const __m256 vScale = _mm256_set1_ps(scale);
185 2 const __m256 vBias = _mm256_set1_ps(bias);
186
187
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
188 8190 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
189 8190 inputVectorPtr += 8;
190 8190 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
191 8190 inputVectorPtr += 8;
192 8190 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
193 8190 inputVectorPtr += 8;
194 8190 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
195 8190 inputVectorPtr += 8;
196
197 32760 inputVal1 = _mm256_max_ps(
198 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
199 vmax_val),
200 vmin_val);
201 32760 inputVal2 = _mm256_max_ps(
202 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
203 vmax_val),
204 vmin_val);
205 32760 inputVal3 = _mm256_max_ps(
206 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
207 vmax_val),
208 vmin_val);
209 32760 inputVal4 = _mm256_max_ps(
210 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
211 vmax_val),
212 vmin_val);
213
214 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
215 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
216 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
217 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
218
219 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
220 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
221 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
222 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
223
224 8190 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
225 8190 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
226
227 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
228 8190 outputVectorPtr += 32;
229 }
230
231
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
232 62 float r = inputVector[number] * scale + bias;
233 62 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
234 }
235 2 }
236
237 #endif /* LV_HAVE_AVX2 */
238
239
240 #ifdef LV_HAVE_SSE2
241 #include <emmintrin.h>
242
243 2 static inline void volk_32f_s32f_x2_convert_8u_u_sse2(uint8_t* outputVector,
244 const float* inputVector,
245 const float scale,
246 const float bias,
247 unsigned int num_points)
248 {
249 2 const unsigned int sixteenthPoints = num_points / 16;
250
251 2 const float* inputVectorPtr = (const float*)inputVector;
252 2 uint8_t* outputVectorPtr = outputVector;
253
254 2 const float min_val = 0.0f;
255 2 const float max_val = UINT8_MAX;
256 2 const __m128 vmin_val = _mm_set_ps1(min_val);
257 2 const __m128 vmax_val = _mm_set_ps1(max_val);
258
259 2 const __m128 vScale = _mm_set_ps1(scale);
260 2 const __m128 vBias = _mm_set_ps1(bias);
261
262
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned int number = 0; number < sixteenthPoints; number++) {
263 16382 __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
264 16382 inputVectorPtr += 4;
265 16382 __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
266 16382 inputVectorPtr += 4;
267 16382 __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
268 16382 inputVectorPtr += 4;
269 16382 __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
270 16382 inputVectorPtr += 4;
271
272 65528 inputVal1 = _mm_max_ps(
273 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal1, vScale), vBias), vmax_val),
274 vmin_val);
275 65528 inputVal2 = _mm_max_ps(
276 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal2, vScale), vBias), vmax_val),
277 vmin_val);
278 65528 inputVal3 = _mm_max_ps(
279 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal3, vScale), vBias), vmax_val),
280 vmin_val);
281 65528 inputVal4 = _mm_max_ps(
282 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal4, vScale), vBias), vmax_val),
283 vmin_val);
284
285 16382 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
286 16382 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
287 16382 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
288 16382 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
289
290 16382 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
291 16382 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
292
293 16382 intInputVal1 = _mm_packus_epi16(intInputVal1, intInputVal3);
294
295 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
296 16382 outputVectorPtr += 16;
297 }
298
299
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
300 30 const float r = inputVector[number] * scale + bias;
301 30 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
302 }
303 2 }
304
305 #endif /* LV_HAVE_SSE2 */
306
307
308 #ifdef LV_HAVE_SSE
309 #include <xmmintrin.h>
310
311 2 static inline void volk_32f_s32f_x2_convert_8u_u_sse(uint8_t* outputVector,
312 const float* inputVector,
313 const float scale,
314 const float bias,
315 unsigned int num_points)
316 {
317 2 const unsigned int quarterPoints = num_points / 4;
318
319 2 const float* inputVectorPtr = (const float*)inputVector;
320 2 uint8_t* outputVectorPtr = outputVector;
321
322 2 const float min_val = 0.0f;
323 2 const float max_val = UINT8_MAX;
324 2 const __m128 vmin_val = _mm_set_ps1(min_val);
325 2 const __m128 vmax_val = _mm_set_ps1(max_val);
326
327 2 const __m128 vScale = _mm_set_ps1(scale);
328 2 const __m128 vBias = _mm_set_ps1(bias);
329
330 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
331
332
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned int number = 0; number < quarterPoints; number++) {
333 65534 __m128 ret = _mm_loadu_ps(inputVectorPtr);
334 65534 inputVectorPtr += 4;
335
336 262136 ret = _mm_max_ps(_mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScale), vBias), vmax_val),
337 vmin_val);
338
339 _mm_store_ps(outputFloatBuffer, ret);
340
2/2
✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.
327670 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
341 262136 *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
342 }
343 }
344
345
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
346 6 const float r = inputVector[number] * scale + bias;
347 6 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
348 }
349 2 }
350
351 #endif /* LV_HAVE_SSE */
352
353
354 #endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_u_H */
355 #ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
356 #define INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
357
358 #include <inttypes.h>
359 #include <volk/volk_common.h>
360
361 #if LV_HAVE_AVX2 && LV_HAVE_FMA
362 #include <immintrin.h>
363
364 2 static inline void volk_32f_s32f_x2_convert_8u_a_avx2_fma(uint8_t* outputVector,
365 const float* inputVector,
366 const float scale,
367 const float bias,
368 unsigned int num_points)
369 {
370 2 const unsigned int thirtysecondPoints = num_points / 32;
371
372 2 const float* inputVectorPtr = (const float*)inputVector;
373 2 uint8_t* outputVectorPtr = outputVector;
374
375 2 const float min_val = 0.0f;
376 2 const float max_val = UINT8_MAX;
377 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
378 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
379
380 2 const __m256 vScale = _mm256_set1_ps(scale);
381 2 const __m256 vBias = _mm256_set1_ps(bias);
382
383
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
384 8190 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
385 8190 inputVectorPtr += 8;
386 8190 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
387 8190 inputVectorPtr += 8;
388 8190 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
389 8190 inputVectorPtr += 8;
390 8190 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
391 8190 inputVectorPtr += 8;
392
393 24570 inputVal1 = _mm256_max_ps(
394 _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
395 24570 inputVal2 = _mm256_max_ps(
396 _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
397 24570 inputVal3 = _mm256_max_ps(
398 _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
399 24570 inputVal4 = _mm256_max_ps(
400 _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);
401
402 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
403 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
404 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
405 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
406
407 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
408 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
409 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
410 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
411
412 8190 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
413 8190 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
414
415 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
416 8190 outputVectorPtr += 32;
417 }
418
419
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
420 62 const float r = inputVector[number] * scale + bias;
421 62 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
422 }
423 2 }
424
425 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
426
427
428 #ifdef LV_HAVE_AVX2
429 #include <immintrin.h>
430
431 2 static inline void volk_32f_s32f_x2_convert_8u_a_avx2(uint8_t* outputVector,
432 const float* inputVector,
433 const float scale,
434 const float bias,
435 unsigned int num_points)
436 {
437 2 const unsigned int thirtysecondPoints = num_points / 32;
438
439 2 const float* inputVectorPtr = (const float*)inputVector;
440 2 uint8_t* outputVectorPtr = outputVector;
441
442 2 const float min_val = 0.0f;
443 2 const float max_val = UINT8_MAX;
444 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
445 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
446
447 2 const __m256 vScale = _mm256_set1_ps(scale);
448 2 const __m256 vBias = _mm256_set1_ps(bias);
449
450
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
451 8190 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
452 8190 inputVectorPtr += 8;
453 8190 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
454 8190 inputVectorPtr += 8;
455 8190 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
456 8190 inputVectorPtr += 8;
457 8190 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
458 8190 inputVectorPtr += 8;
459
460 32760 inputVal1 = _mm256_max_ps(
461 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
462 vmax_val),
463 vmin_val);
464 32760 inputVal2 = _mm256_max_ps(
465 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
466 vmax_val),
467 vmin_val);
468 32760 inputVal3 = _mm256_max_ps(
469 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
470 vmax_val),
471 vmin_val);
472 32760 inputVal4 = _mm256_max_ps(
473 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
474 vmax_val),
475 vmin_val);
476
477 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
478 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
479 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
480 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
481
482 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
483 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
484 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
485 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
486
487 8190 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
488 8190 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
489
490 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
491 8190 outputVectorPtr += 32;
492 }
493
494
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
495 62 const float r = inputVector[number] * scale + bias;
496 62 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
497 }
498 2 }
499
500 #endif /* LV_HAVE_AVX2 */
501
502
503 #ifdef LV_HAVE_SSE2
504 #include <emmintrin.h>
505
506 2 static inline void volk_32f_s32f_x2_convert_8u_a_sse2(uint8_t* outputVector,
507 const float* inputVector,
508 const float scale,
509 const float bias,
510 unsigned int num_points)
511 {
512 2 const unsigned int sixteenthPoints = num_points / 16;
513
514 2 const float* inputVectorPtr = (const float*)inputVector;
515 2 uint8_t* outputVectorPtr = outputVector;
516
517 2 const float min_val = 0.0f;
518 2 const float max_val = UINT8_MAX;
519 2 const __m128 vmin_val = _mm_set_ps1(min_val);
520 2 const __m128 vmax_val = _mm_set_ps1(max_val);
521
522 2 const __m128 vScale = _mm_set_ps1(scale);
523 2 const __m128 vBias = _mm_set_ps1(bias);
524
525
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned int number = 0; number < sixteenthPoints; number++) {
526 16382 __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
527 16382 inputVectorPtr += 4;
528 16382 __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
529 16382 inputVectorPtr += 4;
530 16382 __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
531 16382 inputVectorPtr += 4;
532 16382 __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
533 16382 inputVectorPtr += 4;
534
535 65528 inputVal1 = _mm_max_ps(
536 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal1, vScale), vBias), vmax_val),
537 vmin_val);
538 65528 inputVal2 = _mm_max_ps(
539 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal2, vScale), vBias), vmax_val),
540 vmin_val);
541 65528 inputVal3 = _mm_max_ps(
542 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal3, vScale), vBias), vmax_val),
543 vmin_val);
544 65528 inputVal4 = _mm_max_ps(
545 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal4, vScale), vBias), vmax_val),
546 vmin_val);
547
548 16382 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
549 16382 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
550 16382 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
551 16382 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
552
553 16382 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
554 16382 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
555
556 16382 intInputVal1 = _mm_packus_epi16(intInputVal1, intInputVal3);
557
558 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
559 16382 outputVectorPtr += 16;
560 }
561
562
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
563 30 const float r = inputVector[number] * scale + bias;
564 30 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
565 }
566 2 }
567 #endif /* LV_HAVE_SSE2 */
568
569
570 #ifdef LV_HAVE_SSE
571 #include <xmmintrin.h>
572
573 2 static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector,
574 const float* inputVector,
575 const float scale,
576 const float bias,
577 unsigned int num_points)
578 {
579 2 const unsigned int quarterPoints = num_points / 4;
580
581 2 const float* inputVectorPtr = (const float*)inputVector;
582 2 uint8_t* outputVectorPtr = outputVector;
583
584 2 const float min_val = 0.0f;
585 2 const float max_val = UINT8_MAX;
586 2 const __m128 vmin_val = _mm_set_ps1(min_val);
587 2 const __m128 vmax_val = _mm_set_ps1(max_val);
588
589 2 const __m128 vScalar = _mm_set_ps1(scale);
590 2 const __m128 vBias = _mm_set_ps1(bias);
591
592 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
593
594
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned int number = 0; number < quarterPoints; number++) {
595 65534 __m128 ret = _mm_load_ps(inputVectorPtr);
596 65534 inputVectorPtr += 4;
597
598 262136 ret = _mm_max_ps(
599 _mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScalar), vBias), vmax_val), vmin_val);
600
601 _mm_store_ps(outputFloatBuffer, ret);
602
2/2
✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.
327670 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
603 262136 *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
604 }
605 }
606
607
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
608 6 const float r = inputVector[number] * scale + bias;
609 6 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
610 }
611 2 }
612
613 #endif /* LV_HAVE_SSE */
614
615
616 #endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_a_H */
617