GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_convert_16ic.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 117 127 92.1%
Functions: 5 5 100.0%
Branches: 28 38 73.7%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_convert_16ic
12 *
13 * \b Overview
14 *
15 * Converts a complex vector whose components are 32-bit floats into
16 * a complex vector whose components are 16-bit integers.
17 * Values are saturated to the limits of the output data type.
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
22 * unsigned int num_points); \endcode
23 *
24 * \b Inputs
25 * \li inputVector: The complex 32-bit float input data buffer.
26 * \li num_points: The number of complex data values to be converted.
27 *
28 * \b Outputs
29 * \li outputVector: The complex 16-bit integer output data buffer.
30 *
31 */
32
33 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34 #define INCLUDED_volk_32fc_convert_16ic_a_H
35
36 #include "volk/volk_complex.h"
37 #include <limits.h>
38 #include <math.h>
39
40 #ifdef LV_HAVE_AVX2
41 #include <immintrin.h>
42
43 2 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44 const lv_32fc_t* inputVector,
45 unsigned int num_points)
46 {
47 2 const unsigned int avx_iters = num_points / 8;
48
49 2 float* inputVectorPtr = (float*)inputVector;
50 2 int16_t* outputVectorPtr = (int16_t*)outputVector;
51 float aux;
52
53 2 const float min_val = (float)SHRT_MIN;
54 2 const float max_val = (float)SHRT_MAX;
55
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
58 __m256 ret1, ret2;
59 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
61 unsigned int i;
62
63
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (i = 0; i < avx_iters; i++) {
64 32766 inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65 32766 inputVectorPtr += 8;
66 32766 inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67 32766 inputVectorPtr += 8;
68 32766 __VOLK_PREFETCH(inputVectorPtr + 16);
69
70 // Clip
71 65532 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 65532 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73
74 32766 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 32766 intInputVal2 = _mm256_cvtps_epi32(ret2);
76
77 32766 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 32766 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 32766 outputVectorPtr += 16;
82 }
83
84
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.
30 for (i = avx_iters * 16; i < num_points * 2; i++) {
85 28 aux = *inputVectorPtr++;
86
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 if (aux > max_val)
87 aux = max_val;
88
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 else if (aux < min_val)
89 aux = min_val;
90 28 *outputVectorPtr++ = (int16_t)rintf(aux);
91 }
92 2 }
93 #endif /* LV_HAVE_AVX2 */
94
95 #ifdef LV_HAVE_SSE2
96 #include <emmintrin.h>
97
98 2 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
99 const lv_32fc_t* inputVector,
100 unsigned int num_points)
101 {
102 2 const unsigned int sse_iters = num_points / 4;
103
104 2 float* inputVectorPtr = (float*)inputVector;
105 2 int16_t* outputVectorPtr = (int16_t*)outputVector;
106 float aux;
107
108 2 const float min_val = (float)SHRT_MIN;
109 2 const float max_val = (float)SHRT_MAX;
110
111 __m128 inputVal1, inputVal2;
112 __m128i intInputVal1, intInputVal2;
113 __m128 ret1, ret2;
114 2 const __m128 vmin_val = _mm_set_ps1(min_val);
115 2 const __m128 vmax_val = _mm_set_ps1(max_val);
116 unsigned int i;
117
118
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (i = 0; i < sse_iters; i++) {
119 65534 inputVal1 = _mm_load_ps((float*)inputVectorPtr);
120 65534 inputVectorPtr += 4;
121 65534 inputVal2 = _mm_load_ps((float*)inputVectorPtr);
122 65534 inputVectorPtr += 4;
123 65534 __VOLK_PREFETCH(inputVectorPtr + 8);
124
125 // Clip
126 131068 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
127 131068 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
128
129 65534 intInputVal1 = _mm_cvtps_epi32(ret1);
130 65534 intInputVal2 = _mm_cvtps_epi32(ret2);
131
132 65534 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
133
134 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
135 65534 outputVectorPtr += 8;
136 }
137
138
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.
14 for (i = sse_iters * 8; i < num_points * 2; i++) {
139 12 aux = *inputVectorPtr++;
140
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 if (aux > max_val)
141 aux = max_val;
142
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 else if (aux < min_val)
143 aux = min_val;
144 12 *outputVectorPtr++ = (int16_t)rintf(aux);
145 }
146 2 }
147 #endif /* LV_HAVE_SSE2 */
148
149
150 #if LV_HAVE_NEONV7
151 #include <arm_neon.h>
152
153 #define VCVTRQ_S32_F32(result, value) \
154 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
155 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
156 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
157 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
158
159 static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
160 const lv_32fc_t* inputVector,
161 unsigned int num_points)
162 {
163
164 const unsigned int neon_iters = num_points / 4;
165
166 float32_t* inputVectorPtr = (float32_t*)inputVector;
167 int16_t* outputVectorPtr = (int16_t*)outputVector;
168
169 const float min_val_f = (float)SHRT_MIN;
170 const float max_val_f = (float)SHRT_MAX;
171 float32_t aux;
172 unsigned int i;
173
174 const float32x4_t min_val = vmovq_n_f32(min_val_f);
175 const float32x4_t max_val = vmovq_n_f32(max_val_f);
176 float32x4_t ret1, ret2, a, b;
177
178 int32x4_t toint_a = { 0, 0, 0, 0 };
179 int32x4_t toint_b = { 0, 0, 0, 0 };
180 int16x4_t intInputVal1, intInputVal2;
181 int16x8_t res;
182
183 for (i = 0; i < neon_iters; i++) {
184 a = vld1q_f32((const float32_t*)(inputVectorPtr));
185 inputVectorPtr += 4;
186 b = vld1q_f32((const float32_t*)(inputVectorPtr));
187 inputVectorPtr += 4;
188 __VOLK_PREFETCH(inputVectorPtr + 8);
189
190 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
191 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
192
193 // vcvtr takes into account the current rounding mode (as does rintf)
194 VCVTRQ_S32_F32(toint_a, ret1);
195 VCVTRQ_S32_F32(toint_b, ret2);
196
197 intInputVal1 = vqmovn_s32(toint_a);
198 intInputVal2 = vqmovn_s32(toint_b);
199
200 res = vcombine_s16(intInputVal1, intInputVal2);
201 vst1q_s16((int16_t*)outputVectorPtr, res);
202 outputVectorPtr += 8;
203 }
204
205 for (i = neon_iters * 8; i < num_points * 2; i++) {
206 aux = *inputVectorPtr++;
207 if (aux > max_val_f)
208 aux = max_val_f;
209 else if (aux < min_val_f)
210 aux = min_val_f;
211 *outputVectorPtr++ = (int16_t)rintf(aux);
212 }
213 }
214
215 #undef VCVTRQ_S32_F32
216 #endif /* LV_HAVE_NEONV7 */
217
218 #if LV_HAVE_NEONV8
219 #include <arm_neon.h>
220
221 static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
222 const lv_32fc_t* inputVector,
223 unsigned int num_points)
224 {
225 const unsigned int neon_iters = num_points / 4;
226
227 float32_t* inputVectorPtr = (float32_t*)inputVector;
228 int16_t* outputVectorPtr = (int16_t*)outputVector;
229
230 const float min_val_f = (float)SHRT_MIN;
231 const float max_val_f = (float)SHRT_MAX;
232 float32_t aux;
233 unsigned int i;
234
235 const float32x4_t min_val = vmovq_n_f32(min_val_f);
236 const float32x4_t max_val = vmovq_n_f32(max_val_f);
237 float32x4_t ret1, ret2, a, b;
238
239 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
240 int16x4_t intInputVal1, intInputVal2;
241 int16x8_t res;
242
243 for (i = 0; i < neon_iters; i++) {
244 a = vld1q_f32((const float32_t*)(inputVectorPtr));
245 inputVectorPtr += 4;
246 b = vld1q_f32((const float32_t*)(inputVectorPtr));
247 inputVectorPtr += 4;
248 __VOLK_PREFETCH(inputVectorPtr + 8);
249
250 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
251 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
252
253 // vrndiq takes into account the current rounding mode (as does rintf)
254 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
255 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
256
257 intInputVal1 = vqmovn_s32(toint_a);
258 intInputVal2 = vqmovn_s32(toint_b);
259
260 res = vcombine_s16(intInputVal1, intInputVal2);
261 vst1q_s16((int16_t*)outputVectorPtr, res);
262 outputVectorPtr += 8;
263 }
264
265 for (i = neon_iters * 8; i < num_points * 2; i++) {
266 aux = *inputVectorPtr++;
267 if (aux > max_val_f)
268 aux = max_val_f;
269 else if (aux < min_val_f)
270 aux = min_val_f;
271 *outputVectorPtr++ = (int16_t)rintf(aux);
272 }
273 }
274 #endif /* LV_HAVE_NEONV8 */
275
276
277 #ifdef LV_HAVE_GENERIC
278
279 2 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
280 const lv_32fc_t* inputVector,
281 unsigned int num_points)
282 {
283 2 float* inputVectorPtr = (float*)inputVector;
284 2 int16_t* outputVectorPtr = (int16_t*)outputVector;
285 2 const float min_val = (float)SHRT_MIN;
286 2 const float max_val = (float)SHRT_MAX;
287 float aux;
288 unsigned int i;
289
2/2
✓ Branch 0 taken 524284 times.
✓ Branch 1 taken 2 times.
524286 for (i = 0; i < num_points * 2; i++) {
290 524284 aux = *inputVectorPtr++;
291
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.
524284 if (aux > max_val)
292 aux = max_val;
293
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.
524284 else if (aux < min_val)
294 aux = min_val;
295 524284 *outputVectorPtr++ = (int16_t)rintf(aux);
296 }
297 2 }
298 #endif /* LV_HAVE_GENERIC */
299
300 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
301
302 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303 #define INCLUDED_volk_32fc_convert_16ic_u_H
304
305 #include "volk/volk_complex.h"
306 #include <limits.h>
307 #include <math.h>
308
309
310 #ifdef LV_HAVE_AVX2
311 #include <immintrin.h>
312
313 2 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
314 const lv_32fc_t* inputVector,
315 unsigned int num_points)
316 {
317 2 const unsigned int avx_iters = num_points / 8;
318
319 2 float* inputVectorPtr = (float*)inputVector;
320 2 int16_t* outputVectorPtr = (int16_t*)outputVector;
321 float aux;
322
323 2 const float min_val = (float)SHRT_MIN;
324 2 const float max_val = (float)SHRT_MAX;
325
326 __m256 inputVal1, inputVal2;
327 __m256i intInputVal1, intInputVal2;
328 __m256 ret1, ret2;
329 2 const __m256 vmin_val = _mm256_set1_ps(min_val);
330 2 const __m256 vmax_val = _mm256_set1_ps(max_val);
331 unsigned int i;
332
333
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (i = 0; i < avx_iters; i++) {
334 32766 inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
335 32766 inputVectorPtr += 8;
336 32766 inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
337 32766 inputVectorPtr += 8;
338 32766 __VOLK_PREFETCH(inputVectorPtr + 16);
339
340 // Clip
341 65532 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342 65532 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
343
344 32766 intInputVal1 = _mm256_cvtps_epi32(ret1);
345 32766 intInputVal2 = _mm256_cvtps_epi32(ret2);
346
347 32766 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348 32766 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
349
350 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351 32766 outputVectorPtr += 16;
352 }
353
354
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.
30 for (i = avx_iters * 16; i < num_points * 2; i++) {
355 28 aux = *inputVectorPtr++;
356
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 if (aux > max_val)
357 aux = max_val;
358
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 else if (aux < min_val)
359 aux = min_val;
360 28 *outputVectorPtr++ = (int16_t)rintf(aux);
361 }
362 2 }
363 #endif /* LV_HAVE_AVX2 */
364
365
366 #ifdef LV_HAVE_SSE2
367 #include <emmintrin.h>
368
369 2 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
370 const lv_32fc_t* inputVector,
371 unsigned int num_points)
372 {
373 2 const unsigned int sse_iters = num_points / 4;
374
375 2 float* inputVectorPtr = (float*)inputVector;
376 2 int16_t* outputVectorPtr = (int16_t*)outputVector;
377 float aux;
378
379 2 const float min_val = (float)SHRT_MIN;
380 2 const float max_val = (float)SHRT_MAX;
381
382 __m128 inputVal1, inputVal2;
383 __m128i intInputVal1, intInputVal2;
384 __m128 ret1, ret2;
385 2 const __m128 vmin_val = _mm_set_ps1(min_val);
386 2 const __m128 vmax_val = _mm_set_ps1(max_val);
387
388 unsigned int i;
389
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (i = 0; i < sse_iters; i++) {
390 65534 inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
391 65534 inputVectorPtr += 4;
392 65534 inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
393 65534 inputVectorPtr += 4;
394 65534 __VOLK_PREFETCH(inputVectorPtr + 8);
395
396 // Clip
397 131068 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
398 131068 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
399
400 65534 intInputVal1 = _mm_cvtps_epi32(ret1);
401 65534 intInputVal2 = _mm_cvtps_epi32(ret2);
402
403 65534 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
404
405 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
406 65534 outputVectorPtr += 8;
407 }
408
409
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.
14 for (i = sse_iters * 8; i < num_points * 2; i++) {
410 12 aux = *inputVectorPtr++;
411
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 if (aux > max_val)
412 aux = max_val;
413
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 else if (aux < min_val)
414 aux = min_val;
415 12 *outputVectorPtr++ = (int16_t)rintf(aux);
416 }
417 2 }
418 #endif /* LV_HAVE_SSE2 */
419 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
420