GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8i_s32f_convert_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 132 132 100.0%
Functions: 7 7 100.0%
Branches: 20 20 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_8i_s32f_convert_32f
12 *
13 * \b Overview
14 *
15 * Convert the input vector of 8-bit chars to a vector of floats. The
16 * floats are then divided by the scalar factor.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
21 * float scalar, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li inputVector: The input vector of 8-bit chars.
25 * \li scalar: the scaling factor used to divide the results of the conversion.
26 * \li num_points: The number of values.
27 *
28 * \b Outputs
29 * \li outputVector: The output vector of floats.
30 *
31 * \b Example
32 * \code
33 * int N = 10000;
34 *
35 * volk_8i_s32f_convert_32f();
36 *
37 * volk_free(x);
38 * \endcode
39 */
40
41 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
43
44 #include <inttypes.h>
45 #include <stdio.h>
46
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49
50 2 static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
51 const int8_t* inputVector,
52 const float scalar,
53 unsigned int num_points)
54 {
55 2 unsigned int number = 0;
56 2 const unsigned int sixteenthPoints = num_points / 16;
57
58 2 float* outputVectorPtr = outputVector;
59 2 const float iScalar = 1.0 / scalar;
60 2 __m256 invScalar = _mm256_set1_ps(iScalar);
61 2 const int8_t* inputVectorPtr = inputVector;
62 __m256 ret;
63 __m128i inputVal128;
64 __m256i interimVal;
65
66
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
67 16382 inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
68
69 16382 interimVal = _mm256_cvtepi8_epi32(inputVal128);
70 16382 ret = _mm256_cvtepi32_ps(interimVal);
71 16382 ret = _mm256_mul_ps(ret, invScalar);
72 _mm256_storeu_ps(outputVectorPtr, ret);
73 16382 outputVectorPtr += 8;
74
75 16382 inputVal128 = _mm_srli_si128(inputVal128, 8);
76 16382 interimVal = _mm256_cvtepi8_epi32(inputVal128);
77 16382 ret = _mm256_cvtepi32_ps(interimVal);
78 16382 ret = _mm256_mul_ps(ret, invScalar);
79 _mm256_storeu_ps(outputVectorPtr, ret);
80 16382 outputVectorPtr += 8;
81
82 16382 inputVectorPtr += 16;
83 }
84
85 2 number = sixteenthPoints * 16;
86
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
87 30 outputVector[number] = (float)(inputVector[number]) * iScalar;
88 }
89 2 }
90 #endif /* LV_HAVE_AVX2 */
91
92
93 #ifdef LV_HAVE_SSE4_1
94 #include <smmintrin.h>
95
96 2 static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
97 const int8_t* inputVector,
98 const float scalar,
99 unsigned int num_points)
100 {
101 2 unsigned int number = 0;
102 2 const unsigned int sixteenthPoints = num_points / 16;
103
104 2 float* outputVectorPtr = outputVector;
105 2 const float iScalar = 1.0 / scalar;
106 2 __m128 invScalar = _mm_set_ps1(iScalar);
107 2 const int8_t* inputVectorPtr = inputVector;
108 __m128 ret;
109 __m128i inputVal;
110 __m128i interimVal;
111
112
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
113 16382 inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
114
115 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
116 16382 ret = _mm_cvtepi32_ps(interimVal);
117 16382 ret = _mm_mul_ps(ret, invScalar);
118 _mm_storeu_ps(outputVectorPtr, ret);
119 16382 outputVectorPtr += 4;
120
121 16382 inputVal = _mm_srli_si128(inputVal, 4);
122 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
123 16382 ret = _mm_cvtepi32_ps(interimVal);
124 16382 ret = _mm_mul_ps(ret, invScalar);
125 _mm_storeu_ps(outputVectorPtr, ret);
126 16382 outputVectorPtr += 4;
127
128 16382 inputVal = _mm_srli_si128(inputVal, 4);
129 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
130 16382 ret = _mm_cvtepi32_ps(interimVal);
131 16382 ret = _mm_mul_ps(ret, invScalar);
132 _mm_storeu_ps(outputVectorPtr, ret);
133 16382 outputVectorPtr += 4;
134
135 16382 inputVal = _mm_srli_si128(inputVal, 4);
136 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
137 16382 ret = _mm_cvtepi32_ps(interimVal);
138 16382 ret = _mm_mul_ps(ret, invScalar);
139 _mm_storeu_ps(outputVectorPtr, ret);
140 16382 outputVectorPtr += 4;
141
142 16382 inputVectorPtr += 16;
143 }
144
145 2 number = sixteenthPoints * 16;
146
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
147 30 outputVector[number] = (float)(inputVector[number]) * iScalar;
148 }
149 2 }
150 #endif /* LV_HAVE_SSE4_1 */
151
152 #ifdef LV_HAVE_GENERIC
153
154 2 static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
155 const int8_t* inputVector,
156 const float scalar,
157 unsigned int num_points)
158 {
159 2 float* outputVectorPtr = outputVector;
160 2 const int8_t* inputVectorPtr = inputVector;
161 2 unsigned int number = 0;
162 2 const float iScalar = 1.0 / scalar;
163
164
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
165 262142 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
166 }
167 2 }
168 #endif /* LV_HAVE_GENERIC */
169
170
171 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
172
173 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
175
176 #include <inttypes.h>
177 #include <stdio.h>
178
179 #ifdef LV_HAVE_AVX2
180 #include <immintrin.h>
181
182 2 static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
183 const int8_t* inputVector,
184 const float scalar,
185 unsigned int num_points)
186 {
187 2 unsigned int number = 0;
188 2 const unsigned int sixteenthPoints = num_points / 16;
189
190 2 float* outputVectorPtr = outputVector;
191 2 const float iScalar = 1.0 / scalar;
192 2 __m256 invScalar = _mm256_set1_ps(iScalar);
193 2 const int8_t* inputVectorPtr = inputVector;
194 __m256 ret;
195 __m128i inputVal128;
196 __m256i interimVal;
197
198
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
199 16382 inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
200
201 16382 interimVal = _mm256_cvtepi8_epi32(inputVal128);
202 16382 ret = _mm256_cvtepi32_ps(interimVal);
203 16382 ret = _mm256_mul_ps(ret, invScalar);
204 _mm256_store_ps(outputVectorPtr, ret);
205 16382 outputVectorPtr += 8;
206
207 16382 inputVal128 = _mm_srli_si128(inputVal128, 8);
208 16382 interimVal = _mm256_cvtepi8_epi32(inputVal128);
209 16382 ret = _mm256_cvtepi32_ps(interimVal);
210 16382 ret = _mm256_mul_ps(ret, invScalar);
211 _mm256_store_ps(outputVectorPtr, ret);
212 16382 outputVectorPtr += 8;
213
214 16382 inputVectorPtr += 16;
215 }
216
217 2 number = sixteenthPoints * 16;
218
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
219 30 outputVector[number] = (float)(inputVector[number]) * iScalar;
220 }
221 2 }
222 #endif /* LV_HAVE_AVX2 */
223
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
226
227 2 static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
228 const int8_t* inputVector,
229 const float scalar,
230 unsigned int num_points)
231 {
232 2 unsigned int number = 0;
233 2 const unsigned int sixteenthPoints = num_points / 16;
234
235 2 float* outputVectorPtr = outputVector;
236 2 const float iScalar = 1.0 / scalar;
237 2 __m128 invScalar = _mm_set_ps1(iScalar);
238 2 const int8_t* inputVectorPtr = inputVector;
239 __m128 ret;
240 __m128i inputVal;
241 __m128i interimVal;
242
243
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
244 16382 inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
245
246 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
247 16382 ret = _mm_cvtepi32_ps(interimVal);
248 16382 ret = _mm_mul_ps(ret, invScalar);
249 _mm_store_ps(outputVectorPtr, ret);
250 16382 outputVectorPtr += 4;
251
252 16382 inputVal = _mm_srli_si128(inputVal, 4);
253 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
254 16382 ret = _mm_cvtepi32_ps(interimVal);
255 16382 ret = _mm_mul_ps(ret, invScalar);
256 _mm_store_ps(outputVectorPtr, ret);
257 16382 outputVectorPtr += 4;
258
259 16382 inputVal = _mm_srli_si128(inputVal, 4);
260 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
261 16382 ret = _mm_cvtepi32_ps(interimVal);
262 16382 ret = _mm_mul_ps(ret, invScalar);
263 _mm_store_ps(outputVectorPtr, ret);
264 16382 outputVectorPtr += 4;
265
266 16382 inputVal = _mm_srli_si128(inputVal, 4);
267 16382 interimVal = _mm_cvtepi8_epi32(inputVal);
268 16382 ret = _mm_cvtepi32_ps(interimVal);
269 16382 ret = _mm_mul_ps(ret, invScalar);
270 _mm_store_ps(outputVectorPtr, ret);
271 16382 outputVectorPtr += 4;
272
273 16382 inputVectorPtr += 16;
274 }
275
276 2 number = sixteenthPoints * 16;
277
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
278 30 outputVector[number] = (float)(inputVector[number]) * iScalar;
279 }
280 2 }
281 #endif /* LV_HAVE_SSE4_1 */
282
283 #ifdef LV_HAVE_NEON
284 #include <arm_neon.h>
285
286 static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
287 const int8_t* inputVector,
288 const float scalar,
289 unsigned int num_points)
290 {
291 float* outputVectorPtr = outputVector;
292 const int8_t* inputVectorPtr = inputVector;
293
294 const float iScalar = 1.0 / scalar;
295 const float32x4_t qiScalar = vdupq_n_f32(iScalar);
296
297 int8x16_t inputVal;
298
299 int16x8_t lower;
300 int16x8_t higher;
301
302 float32x4_t outputFloat;
303
304 unsigned int number = 0;
305 const unsigned int sixteenthPoints = num_points / 16;
306 for (; number < sixteenthPoints; number++) {
307 inputVal = vld1q_s8(inputVectorPtr);
308 inputVectorPtr += 16;
309
310 lower = vmovl_s8(vget_low_s8(inputVal));
311 higher = vmovl_s8(vget_high_s8(inputVal));
312
313 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
314 vst1q_f32(outputVectorPtr, outputFloat);
315 outputVectorPtr += 4;
316
317 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
318 vst1q_f32(outputVectorPtr, outputFloat);
319 outputVectorPtr += 4;
320
321 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
322 vst1q_f32(outputVectorPtr, outputFloat);
323 outputVectorPtr += 4;
324
325 outputFloat =
326 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
327 vst1q_f32(outputVectorPtr, outputFloat);
328 outputVectorPtr += 4;
329 }
330 for (number = sixteenthPoints * 16; number < num_points; number++) {
331 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
332 }
333 }
334
335 #endif /* LV_HAVE_NEON */
336
337 #ifdef LV_HAVE_GENERIC
338
339 2 static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
340 const int8_t* inputVector,
341 const float scalar,
342 unsigned int num_points)
343 {
344 2 float* outputVectorPtr = outputVector;
345 2 const int8_t* inputVectorPtr = inputVector;
346 2 unsigned int number = 0;
347 2 const float iScalar = 1.0 / scalar;
348
349
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
350 262142 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
351 }
352 2 }
353 #endif /* LV_HAVE_GENERIC */
354
355
356 #ifdef LV_HAVE_ORC
357 extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
358 const int8_t* inputVector,
359 const float scalar,
360 unsigned int num_points);
361
362 2 static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
363 const int8_t* inputVector,
364 const float scalar,
365 unsigned int num_points)
366 {
367 2 float invscalar = 1.0 / scalar;
368 2 volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
369 2 }
370 #endif /* LV_HAVE_ORC */
371
372
373 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
374