GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16i_s32f_convert_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 176 176 100.0%
Functions: 10 10 100.0%
Branches: 36 36 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16i_s32f_convert_32f
12 *
13 * \b Overview
14 *
15 * Converts 16-bit shorts to scaled 32-bit floating point values.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const
20 * float scalar, unsigned int num_points); \endcode
21 *
22 * \b Inputs
23 * \li inputVector: The input vector of 16-bit shorts.
24 * \li scalar: The value each input point is divided by before being written to the output.
25 * \li num_points: The number of data points to convert.
26 *
27 * \b Outputs
28 * \li outputVector: The output vector of 32-bit floats.
29 *
30 * \b Example
31 * \code
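32 * int N = 10000;
33 * int16_t* x = (int16_t*)volk_malloc(sizeof(int16_t) * N, volk_get_alignment());
34 * float* t = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
35 * volk_16i_s32f_convert_32f(t, x, 32768.f, N);
36 * volk_free(x);
37 * volk_free(t);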
38 * \endcode
39 */
40
41 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
43
44 #include <inttypes.h>
45 #include <stdio.h>
46
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49
50 2 static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
51 const int16_t* inputVector,
52 const float scalar,
53 unsigned int num_points)
54 {
55 2 unsigned int number = 0;
56 2 const unsigned int eighthPoints = num_points / 8;
57
58 2 float* outputVectorPtr = outputVector;
59 2 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
60 2 int16_t* inputPtr = (int16_t*)inputVector;
61 __m128i inputVal;
62 __m256i inputVal2;
63 __m256 ret;
64
65
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
66
67 // Load the 8 values
68 32766 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
69
70 // Convert
71 32766 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
72
73 32766 ret = _mm256_cvtepi32_ps(inputVal2);
74 32766 ret = _mm256_mul_ps(ret, invScalar);
75
76 _mm256_storeu_ps(outputVectorPtr, ret);
77
78 32766 outputVectorPtr += 8;
79
80 32766 inputPtr += 8;
81 }
82
83 2 number = eighthPoints * 8;
84
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
85 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
86 }
87 2 }
88 #endif /* LV_HAVE_AVX2 */
89
90 #ifdef LV_HAVE_AVX
91 #include <immintrin.h>
92
93 2 static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
94 const int16_t* inputVector,
95 const float scalar,
96 unsigned int num_points)
97 {
98 2 unsigned int number = 0;
99 2 const unsigned int eighthPoints = num_points / 8;
100
101 2 float* outputVectorPtr = outputVector;
102 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
103 2 int16_t* inputPtr = (int16_t*)inputVector;
104 __m128i inputVal, inputVal2;
105 __m128 ret;
106 __m256 output;
107 2 __m256 dummy = _mm256_setzero_ps();
108
109
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
110
111 // Load the 8 values
112 // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
113 32766 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
114
115 // Shift the input data to the right by 64 bits ( 8 bytes )
116 32766 inputVal2 = _mm_srli_si128(inputVal, 8);
117
118 // Convert the lower 4 values into 32 bit words
119 32766 inputVal = _mm_cvtepi16_epi32(inputVal);
120 32766 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
121
122 32766 ret = _mm_cvtepi32_ps(inputVal);
123 32766 ret = _mm_mul_ps(ret, invScalar);
124 32766 output = _mm256_insertf128_ps(dummy, ret, 0);
125
126 32766 ret = _mm_cvtepi32_ps(inputVal2);
127 32766 ret = _mm_mul_ps(ret, invScalar);
128 32766 output = _mm256_insertf128_ps(output, ret, 1);
129
130 _mm256_storeu_ps(outputVectorPtr, output);
131
132 32766 outputVectorPtr += 8;
133
134 32766 inputPtr += 8;
135 }
136
137 2 number = eighthPoints * 8;
138
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
139 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
140 }
141 2 }
142 #endif /* LV_HAVE_AVX */
143
144 #ifdef LV_HAVE_SSE4_1
145 #include <smmintrin.h>
146
147 2 static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
148 const int16_t* inputVector,
149 const float scalar,
150 unsigned int num_points)
151 {
152 2 unsigned int number = 0;
153 2 const unsigned int eighthPoints = num_points / 8;
154
155 2 float* outputVectorPtr = outputVector;
156 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157 2 int16_t* inputPtr = (int16_t*)inputVector;
158 __m128i inputVal;
159 __m128i inputVal2;
160 __m128 ret;
161
162
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
163
164 // Load the 8 values
165 32766 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
166
167 // Shift the input data to the right by 64 bits ( 8 bytes )
168 32766 inputVal2 = _mm_srli_si128(inputVal, 8);
169
170 // Convert the lower 4 values into 32 bit words
171 32766 inputVal = _mm_cvtepi16_epi32(inputVal);
172 32766 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
173
174 32766 ret = _mm_cvtepi32_ps(inputVal);
175 32766 ret = _mm_mul_ps(ret, invScalar);
176 _mm_storeu_ps(outputVectorPtr, ret);
177 32766 outputVectorPtr += 4;
178
179 32766 ret = _mm_cvtepi32_ps(inputVal2);
180 32766 ret = _mm_mul_ps(ret, invScalar);
181 _mm_storeu_ps(outputVectorPtr, ret);
182
183 32766 outputVectorPtr += 4;
184
185 32766 inputPtr += 8;
186 }
187
188 2 number = eighthPoints * 8;
189
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
190 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
191 }
192 2 }
193 #endif /* LV_HAVE_SSE4_1 */
194
195 #ifdef LV_HAVE_SSE
196 #include <xmmintrin.h>
197
198 2 static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
199 const int16_t* inputVector,
200 const float scalar,
201 unsigned int num_points)
202 {
203 2 unsigned int number = 0;
204 2 const unsigned int quarterPoints = num_points / 4;
205
206 2 float* outputVectorPtr = outputVector;
207 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
208 2 int16_t* inputPtr = (int16_t*)inputVector;
209 __m128 ret;
210
211
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
212 65534 ret = _mm_set_ps((float)(inputPtr[3]),
213 65534 (float)(inputPtr[2]),
214 65534 (float)(inputPtr[1]),
215 65534 (float)(inputPtr[0]));
216
217 65534 ret = _mm_mul_ps(ret, invScalar);
218 _mm_storeu_ps(outputVectorPtr, ret);
219
220 65534 inputPtr += 4;
221 65534 outputVectorPtr += 4;
222 }
223
224 2 number = quarterPoints * 4;
225
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
226 6 outputVector[number] = (float)(inputVector[number]) / scalar;
227 }
228 2 }
229 #endif /* LV_HAVE_SSE */
230
231 #ifdef LV_HAVE_GENERIC
232
233 2 static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
234 const int16_t* inputVector,
235 const float scalar,
236 unsigned int num_points)
237 {
238 2 float* outputVectorPtr = outputVector;
239 2 const int16_t* inputVectorPtr = inputVector;
240 2 unsigned int number = 0;
241
242
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
243 262142 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
244 }
245 2 }
246 #endif /* LV_HAVE_GENERIC */
247
248 #ifdef LV_HAVE_NEON
249 #include <arm_neon.h>
250
251 static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
252 const int16_t* inputVector,
253 const float scalar,
254 unsigned int num_points)
255 {
256 float* outputPtr = outputVector;
257 const int16_t* inputPtr = inputVector;
258 unsigned int number = 0;
259 unsigned int eighth_points = num_points / 8;
260
261 int16x4x2_t input16;
262 int32x4_t input32_0, input32_1;
263 float32x4_t input_float_0, input_float_1;
264 float32x4x2_t output_float;
265 float32x4_t inv_scale;
266
267 inv_scale = vdupq_n_f32(1.0 / scalar);
268
269 // the generic disassembles to a 128-bit load
270 // and duplicates every instruction to operate on 64-bits
271 // at a time. This is only possible with lanes, which is faster
272 // than just doing a vld1_s16, but still slower.
273 for (number = 0; number < eighth_points; number++) {
274 input16 = vld2_s16(inputPtr);
275 // widen 16-bit int to 32-bit int
276 input32_0 = vmovl_s16(input16.val[0]);
277 input32_1 = vmovl_s16(input16.val[1]);
278 // convert 32-bit int to float with scale
279 input_float_0 = vcvtq_f32_s32(input32_0);
280 input_float_1 = vcvtq_f32_s32(input32_1);
281 output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
282 output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
283 vst2q_f32(outputPtr, output_float);
284 inputPtr += 8;
285 outputPtr += 8;
286 }
287
288 for (number = eighth_points * 8; number < num_points; number++) {
289 *outputPtr++ = ((float)(*inputPtr++)) / scalar;
290 }
291 }
292 #endif /* LV_HAVE_NEON */
293
294
295 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
296 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
297 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
298
299 #include <inttypes.h>
300 #include <stdio.h>
301
302 #ifdef LV_HAVE_AVX2
303 #include <immintrin.h>
304
305 2 static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
306 const int16_t* inputVector,
307 const float scalar,
308 unsigned int num_points)
309 {
310 2 unsigned int number = 0;
311 2 const unsigned int eighthPoints = num_points / 8;
312
313 2 float* outputVectorPtr = outputVector;
314 2 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
315 2 int16_t* inputPtr = (int16_t*)inputVector;
316 __m128i inputVal;
317 __m256i inputVal2;
318 __m256 ret;
319
320
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
321
322 // Load the 8 values
323 32766 inputVal = _mm_load_si128((__m128i*)inputPtr);
324
325 // Convert
326 32766 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
327
328 32766 ret = _mm256_cvtepi32_ps(inputVal2);
329 32766 ret = _mm256_mul_ps(ret, invScalar);
330
331 _mm256_store_ps(outputVectorPtr, ret);
332
333 32766 outputVectorPtr += 8;
334
335 32766 inputPtr += 8;
336 }
337
338 2 number = eighthPoints * 8;
339
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
340 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
341 }
342 2 }
343 #endif /* LV_HAVE_AVX2 */
344
345 #ifdef LV_HAVE_AVX
346 #include <immintrin.h>
347
348 2 static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
349 const int16_t* inputVector,
350 const float scalar,
351 unsigned int num_points)
352 {
353 2 unsigned int number = 0;
354 2 const unsigned int eighthPoints = num_points / 8;
355
356 2 float* outputVectorPtr = outputVector;
357 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
358 2 int16_t* inputPtr = (int16_t*)inputVector;
359 __m128i inputVal, inputVal2;
360 __m128 ret;
361 __m256 output;
362 2 __m256 dummy = _mm256_setzero_ps();
363
364
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
365
366 // Load the 8 values
367 // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
368 32766 inputVal = _mm_load_si128((__m128i*)inputPtr);
369
370 // Shift the input data to the right by 64 bits ( 8 bytes )
371 32766 inputVal2 = _mm_srli_si128(inputVal, 8);
372
373 // Convert the lower 4 values into 32 bit words
374 32766 inputVal = _mm_cvtepi16_epi32(inputVal);
375 32766 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
376
377 32766 ret = _mm_cvtepi32_ps(inputVal);
378 32766 ret = _mm_mul_ps(ret, invScalar);
379 32766 output = _mm256_insertf128_ps(dummy, ret, 0);
380
381 32766 ret = _mm_cvtepi32_ps(inputVal2);
382 32766 ret = _mm_mul_ps(ret, invScalar);
383 32766 output = _mm256_insertf128_ps(output, ret, 1);
384
385 _mm256_store_ps(outputVectorPtr, output);
386
387 32766 outputVectorPtr += 8;
388
389 32766 inputPtr += 8;
390 }
391
392 2 number = eighthPoints * 8;
393
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
394 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
395 }
396 2 }
397 #endif /* LV_HAVE_AVX */
398
399 #ifdef LV_HAVE_SSE4_1
400 #include <smmintrin.h>
401
402 2 static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
403 const int16_t* inputVector,
404 const float scalar,
405 unsigned int num_points)
406 {
407 2 unsigned int number = 0;
408 2 const unsigned int eighthPoints = num_points / 8;
409
410 2 float* outputVectorPtr = outputVector;
411 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
412 2 int16_t* inputPtr = (int16_t*)inputVector;
413 __m128i inputVal;
414 __m128i inputVal2;
415 __m128 ret;
416
417
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
418
419 // Load the 8 values
420 32766 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
421
422 // Shift the input data to the right by 64 bits ( 8 bytes )
423 32766 inputVal2 = _mm_srli_si128(inputVal, 8);
424
425 // Convert the lower 4 values into 32 bit words
426 32766 inputVal = _mm_cvtepi16_epi32(inputVal);
427 32766 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
428
429 32766 ret = _mm_cvtepi32_ps(inputVal);
430 32766 ret = _mm_mul_ps(ret, invScalar);
431 _mm_storeu_ps(outputVectorPtr, ret);
432 32766 outputVectorPtr += 4;
433
434 32766 ret = _mm_cvtepi32_ps(inputVal2);
435 32766 ret = _mm_mul_ps(ret, invScalar);
436 _mm_storeu_ps(outputVectorPtr, ret);
437
438 32766 outputVectorPtr += 4;
439
440 32766 inputPtr += 8;
441 }
442
443 2 number = eighthPoints * 8;
444
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
445 14 outputVector[number] = ((float)(inputVector[number])) / scalar;
446 }
447 2 }
448 #endif /* LV_HAVE_SSE4_1 */
449
450 #ifdef LV_HAVE_SSE
451 #include <xmmintrin.h>
452
453 2 static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
454 const int16_t* inputVector,
455 const float scalar,
456 unsigned int num_points)
457 {
458 2 unsigned int number = 0;
459 2 const unsigned int quarterPoints = num_points / 4;
460
461 2 float* outputVectorPtr = outputVector;
462 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
463 2 int16_t* inputPtr = (int16_t*)inputVector;
464 __m128 ret;
465
466
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
467 65534 ret = _mm_set_ps((float)(inputPtr[3]),
468 65534 (float)(inputPtr[2]),
469 65534 (float)(inputPtr[1]),
470 65534 (float)(inputPtr[0]));
471
472 65534 ret = _mm_mul_ps(ret, invScalar);
473 _mm_storeu_ps(outputVectorPtr, ret);
474
475 65534 inputPtr += 4;
476 65534 outputVectorPtr += 4;
477 }
478
479 2 number = quarterPoints * 4;
480
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
481 6 outputVector[number] = (float)(inputVector[number]) / scalar;
482 }
483 2 }
484 #endif /* LV_HAVE_SSE */
485
486 #ifdef LV_HAVE_GENERIC
487
488 2 static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
489 const int16_t* inputVector,
490 const float scalar,
491 unsigned int num_points)
492 {
493 2 float* outputVectorPtr = outputVector;
494 2 const int16_t* inputVectorPtr = inputVector;
495 2 unsigned int number = 0;
496
497
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
498 262142 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
499 }
500 2 }
501 #endif /* LV_HAVE_GENERIC */
502
503 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
504