GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_convert_8i.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 195 195 100.0%
Functions: 8 8 100.0%
Branches: 34 34 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_s32f_convert_8i
12 *
13 * \b Overview
14 *
15 * Converts a floating point number to a 8-bit int after applying a
16 * scaling factor.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
21 float scalar, unsigned int num_points)
22 * \endcode
23 *
24 * \b Inputs
25 * \li inputVector: the input vector of floats.
26 * \li scalar: The value multiplied against each point in the input buffer.
27 * \li num_points: The number of data points.
28 *
29 * \b Outputs
30 * \li outputVector: The output vector.
31 *
32 * \b Example
33 * Convert floats from [-1,1] to 8-bit integers with a scale of 5 to maintain smallest
34 delta
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
39 *
40 * for(unsigned int ii = 0; ii < N; ++ii){
41 * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
42 * }
43 *
44 * // Normalize by the smallest delta (0.2 in this example)
45 * // With float -> 8 bit ints be careful of scaling
46
47 * float scale = 5.1f;
48 *
49 * volk_32f_s32f_convert_8i(out, increasing, scale, N);
50 *
51 * for(unsigned int ii = 0; ii < N; ++ii){
52 * printf("out[%u] = %i\n", ii, out[ii]);
53 * }
54 *
55 * volk_free(increasing);
56 * volk_free(out);
57 * \endcode
58 */
59
60 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
62
63 #include <inttypes.h>
64
65 262338 static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
66 {
67 262338 const float min_val = INT8_MIN;
68 262338 const float max_val = INT8_MAX;
69
2/2
✓ Branch 0 taken 79941 times.
✓ Branch 1 taken 182397 times.
262338 if (in > max_val) {
70 79941 *out = (int8_t)(max_val);
71
2/2
✓ Branch 0 taken 79659 times.
✓ Branch 1 taken 102738 times.
182397 } else if (in < min_val) {
72 79659 *out = (int8_t)(min_val);
73 } else {
74 102738 *out = (int8_t)(rintf(in));
75 }
76 262338 }
77
78 #ifdef LV_HAVE_GENERIC
79
80 2 static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
81 const float* inputVector,
82 const float scalar,
83 unsigned int num_points)
84 {
85 2 const float* inputVectorPtr = inputVector;
86
87
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (unsigned int number = 0; number < num_points; number++) {
88 262142 const float r = *inputVectorPtr++ * scalar;
89 262142 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
90 }
91 2 }
92
93 #endif /* LV_HAVE_GENERIC */
94
95
96 #ifdef LV_HAVE_AVX2
97 #include <immintrin.h>
98
99 2 static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
100 const float* inputVector,
101 const float scalar,
102 unsigned int num_points)
103 {
104 2 const unsigned int thirtysecondPoints = num_points / 32;
105
106 2 const float* inputVectorPtr = (const float*)inputVector;
107 2 int8_t* outputVectorPtr = outputVector;
108
109 2 const float min_val = INT8_MIN;
110 2 const float max_val = INT8_MAX;
111 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
112 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
113
114 2 const __m256 vScalar = _mm256_set1_ps(scalar);
115
116
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
117 8190 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
118 8190 inputVectorPtr += 8;
119 8190 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
120 8190 inputVectorPtr += 8;
121 8190 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
122 8190 inputVectorPtr += 8;
123 8190 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
124 8190 inputVectorPtr += 8;
125
126 24570 inputVal1 = _mm256_max_ps(
127 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
128 24570 inputVal2 = _mm256_max_ps(
129 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
130 24570 inputVal3 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
132 24570 inputVal4 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
134
135 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
136 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
137 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
138 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
139
140 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
141 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
142 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
143 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
144
145 8190 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
146 8190 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
147
148 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
149 8190 outputVectorPtr += 32;
150 }
151
152
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
153 62 float r = inputVector[number] * scalar;
154 62 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
155 }
156 2 }
157
158 #endif /* LV_HAVE_AVX2 */
159
160
161 #ifdef LV_HAVE_SSE2
162 #include <emmintrin.h>
163
164 2 static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
165 const float* inputVector,
166 const float scalar,
167 unsigned int num_points)
168 {
169 2 const unsigned int sixteenthPoints = num_points / 16;
170
171 2 const float* inputVectorPtr = (const float*)inputVector;
172 2 int8_t* outputVectorPtr = outputVector;
173
174 2 const float min_val = INT8_MIN;
175 2 const float max_val = INT8_MAX;
176 2 const __m128 vmin_val = _mm_set_ps1(min_val);
177 2 const __m128 vmax_val = _mm_set_ps1(max_val);
178
179 2 const __m128 vScalar = _mm_set_ps1(scalar);
180
181
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned int number = 0; number < sixteenthPoints; number++) {
182 16382 __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
183 16382 inputVectorPtr += 4;
184 16382 __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
185 16382 inputVectorPtr += 4;
186 16382 __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
187 16382 inputVectorPtr += 4;
188 16382 __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
189 16382 inputVectorPtr += 4;
190
191 inputVal1 =
192 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
193 inputVal2 =
194 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
195 inputVal3 =
196 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
197 inputVal4 =
198 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
199
200 16382 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
201 16382 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
202 16382 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
203 16382 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
204
205 16382 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
206 16382 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
207
208 16382 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
209
210 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
211 16382 outputVectorPtr += 16;
212 }
213
214
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
215 30 const float r = inputVector[number] * scalar;
216 30 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
217 }
218 2 }
219
220 #endif /* LV_HAVE_SSE2 */
221
222
223 #ifdef LV_HAVE_SSE
224 #include <xmmintrin.h>
225
226 2 static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
227 const float* inputVector,
228 const float scalar,
229 unsigned int num_points)
230 {
231 2 const unsigned int quarterPoints = num_points / 4;
232
233 2 const float* inputVectorPtr = (const float*)inputVector;
234 2 int8_t* outputVectorPtr = outputVector;
235
236 2 const float min_val = INT8_MIN;
237 2 const float max_val = INT8_MAX;
238 2 const __m128 vmin_val = _mm_set_ps1(min_val);
239 2 const __m128 vmax_val = _mm_set_ps1(max_val);
240
241 2 const __m128 vScalar = _mm_set_ps1(scalar);
242
243 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
244
245
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned int number = 0; number < quarterPoints; number++) {
246 65534 __m128 ret = _mm_loadu_ps(inputVectorPtr);
247 65534 inputVectorPtr += 4;
248
249 196602 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
250
251 _mm_store_ps(outputFloatBuffer, ret);
252
2/2
✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.
327670 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
253 262136 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
254 }
255 }
256
257
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
258 6 const float r = inputVector[number] * scalar;
259 6 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
260 }
261 2 }
262
263 #endif /* LV_HAVE_SSE */
264
265
266 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
267 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
268 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
269
270 #include <inttypes.h>
271
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274
275 2 static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
276 const float* inputVector,
277 const float scalar,
278 unsigned int num_points)
279 {
280 2 const unsigned int thirtysecondPoints = num_points / 32;
281
282 2 const float* inputVectorPtr = (const float*)inputVector;
283 2 int8_t* outputVectorPtr = outputVector;
284
285 2 const float min_val = INT8_MIN;
286 2 const float max_val = INT8_MAX;
287 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
288 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
289
290 2 const __m256 vScalar = _mm256_set1_ps(scalar);
291
292
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
293 8190 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
294 8190 inputVectorPtr += 8;
295 8190 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
296 8190 inputVectorPtr += 8;
297 8190 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
298 8190 inputVectorPtr += 8;
299 8190 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
300 8190 inputVectorPtr += 8;
301
302 24570 inputVal1 = _mm256_max_ps(
303 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
304 24570 inputVal2 = _mm256_max_ps(
305 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
306 24570 inputVal3 = _mm256_max_ps(
307 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
308 32760 inputVal4 = _mm256_max_ps(
309 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
310
311 8190 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
312 8190 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
313 8190 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
314 8190 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
315
316 8190 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
317 8190 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
318 8190 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
319 8190 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
320
321 8190 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
322 8190 __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
323
324 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
325 8190 outputVectorPtr += 32;
326 }
327
328
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
329 62 const float r = inputVector[number] * scalar;
330 62 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
331 }
332 2 }
333
334 #endif /* LV_HAVE_AVX2 */
335
336
337 #ifdef LV_HAVE_SSE2
338 #include <emmintrin.h>
339
340 2 static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
341 const float* inputVector,
342 const float scalar,
343 unsigned int num_points)
344 {
345 2 const unsigned int sixteenthPoints = num_points / 16;
346
347 2 const float* inputVectorPtr = (const float*)inputVector;
348 2 int8_t* outputVectorPtr = outputVector;
349
350 2 const float min_val = INT8_MIN;
351 2 const float max_val = INT8_MAX;
352 2 const __m128 vmin_val = _mm_set_ps1(min_val);
353 2 const __m128 vmax_val = _mm_set_ps1(max_val);
354
355 2 const __m128 vScalar = _mm_set_ps1(scalar);
356
357
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned int number = 0; number < sixteenthPoints; number++) {
358 16382 __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
359 16382 inputVectorPtr += 4;
360 16382 __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
361 16382 inputVectorPtr += 4;
362 16382 __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
363 16382 inputVectorPtr += 4;
364 16382 __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
365 16382 inputVectorPtr += 4;
366
367 inputVal1 =
368 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
369 inputVal2 =
370 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
371 inputVal3 =
372 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
373 inputVal4 =
374 49146 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
375
376 16382 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
377 16382 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
378 16382 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
379 16382 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
380
381 16382 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
382 16382 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
383
384 16382 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
385
386 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
387 16382 outputVectorPtr += 16;
388 }
389
390
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
391 30 const float r = inputVector[number] * scalar;
392 30 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
393 }
394 2 }
395 #endif /* LV_HAVE_SSE2 */
396
397
398 #ifdef LV_HAVE_SSE
399 #include <xmmintrin.h>
400
401 2 static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
402 const float* inputVector,
403 const float scalar,
404 unsigned int num_points)
405 {
406 2 const unsigned int quarterPoints = num_points / 4;
407
408 2 const float* inputVectorPtr = (const float*)inputVector;
409 2 int8_t* outputVectorPtr = outputVector;
410
411 2 const float min_val = INT8_MIN;
412 2 const float max_val = INT8_MAX;
413 2 const __m128 vmin_val = _mm_set_ps1(min_val);
414 2 const __m128 vmax_val = _mm_set_ps1(max_val);
415
416 2 const __m128 vScalar = _mm_set_ps1(scalar);
417
418 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
419
420
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned int number = 0; number < quarterPoints; number++) {
421 65534 __m128 ret = _mm_load_ps(inputVectorPtr);
422 65534 inputVectorPtr += 4;
423
424 196602 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
425
426 _mm_store_ps(outputFloatBuffer, ret);
427
2/2
✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.
327670 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
428 262136 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
429 }
430 }
431
432
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
433 6 const float r = inputVector[number] * scalar;
434 6 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
435 }
436 2 }
437
438 #endif /* LV_HAVE_SSE */
439
440
441 #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
442