Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_s32f_convert_16i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Converts a floating point number to a 16-bit short after applying a | ||
16 | * scaling factor. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const | ||
21 | * float scalar, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li inputVector: the input vector of floats. | ||
25 | * \li scalar: The value multiplied against each point in the input buffer. | ||
26 | * \li num_points: The number of data points. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li outputVector: The output vector. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta | ||
33 | * \code | ||
34 | * int N = 10; unsigned int alignment = volk_get_alignment(); | ||
35 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); | ||
37 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
38 | * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; | ||
39 | * } | ||
40 | * | ||
41 | * // Normalize by the smallest delta (0.2 in this example) | ||
42 | * float scale = 5.f; | ||
43 | * | ||
44 | * volk_32f_s32f_convert_16i(out, increasing, scale, N); | ||
45 | * | ||
46 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
47 | * printf("out[%u] = %i\n", ii, out[ii]); | ||
48 | * } | ||
49 | * | ||
50 | * volk_free(increasing); | ||
51 | * volk_free(out); | ||
52 | * \endcode | ||
53 | */ | ||
54 | |||
55 | #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H | ||
56 | #define INCLUDED_volk_32f_s32f_convert_16i_u_H | ||
57 | |||
58 | #include <inttypes.h> | ||
59 | #include <limits.h> | ||
60 | #include <stdio.h> | ||
61 | |||
62 | #ifdef LV_HAVE_AVX2 | ||
63 | #include <immintrin.h> | ||
64 | |||
65 | 2 | static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, | |
66 | const float* inputVector, | ||
67 | const float scalar, | ||
68 | unsigned int num_points) | ||
69 | { | ||
70 | 2 | unsigned int number = 0; | |
71 | |||
72 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
73 | |||
74 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
75 | 2 | int16_t* outputVectorPtr = outputVector; | |
76 | |||
77 | 2 | float min_val = SHRT_MIN; | |
78 | 2 | float max_val = SHRT_MAX; | |
79 | float r; | ||
80 | |||
81 | 2 | __m256 vScalar = _mm256_set1_ps(scalar); | |
82 | __m256 inputVal1, inputVal2; | ||
83 | __m256i intInputVal1, intInputVal2; | ||
84 | __m256 ret1, ret2; | ||
85 | 2 | __m256 vmin_val = _mm256_set1_ps(min_val); | |
86 | 2 | __m256 vmax_val = _mm256_set1_ps(max_val); | |
87 | |||
88 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
89 | 16382 | inputVal1 = _mm256_loadu_ps(inputVectorPtr); | |
90 | 16382 | inputVectorPtr += 8; | |
91 | 16382 | inputVal2 = _mm256_loadu_ps(inputVectorPtr); | |
92 | 16382 | inputVectorPtr += 8; | |
93 | |||
94 | // Scale and clip | ||
95 | 49146 | ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), | |
96 | vmin_val); | ||
97 | 49146 | ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), | |
98 | vmin_val); | ||
99 | |||
100 | 16382 | intInputVal1 = _mm256_cvtps_epi32(ret1); | |
101 | 16382 | intInputVal2 = _mm256_cvtps_epi32(ret2); | |
102 | |||
103 | 16382 | intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); | |
104 | 16382 | intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); | |
105 | |||
106 | _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); | ||
107 | 16382 | outputVectorPtr += 16; | |
108 | } | ||
109 | |||
110 | 2 | number = sixteenthPoints * 16; | |
111 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
112 | 30 | r = inputVector[number] * scalar; | |
113 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | if (r > max_val) |
114 | ✗ | r = max_val; | |
115 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | else if (r < min_val) |
116 | ✗ | r = min_val; | |
117 | 30 | outputVector[number] = (int16_t)rintf(r); | |
118 | } | ||
119 | 2 | } | |
120 | #endif /* LV_HAVE_AVX2 */ | ||
121 | |||
122 | |||
123 | #ifdef LV_HAVE_AVX | ||
124 | #include <immintrin.h> | ||
125 | |||
126 | 2 | static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, | |
127 | const float* inputVector, | ||
128 | const float scalar, | ||
129 | unsigned int num_points) | ||
130 | { | ||
131 | 2 | unsigned int number = 0; | |
132 | |||
133 | 2 | const unsigned int eighthPoints = num_points / 8; | |
134 | |||
135 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
136 | 2 | int16_t* outputVectorPtr = outputVector; | |
137 | |||
138 | 2 | float min_val = SHRT_MIN; | |
139 | 2 | float max_val = SHRT_MAX; | |
140 | float r; | ||
141 | |||
142 | 2 | __m256 vScalar = _mm256_set1_ps(scalar); | |
143 | __m256 inputVal, ret; | ||
144 | __m256i intInputVal; | ||
145 | __m128i intInputVal1, intInputVal2; | ||
146 | 2 | __m256 vmin_val = _mm256_set1_ps(min_val); | |
147 | 2 | __m256 vmax_val = _mm256_set1_ps(max_val); | |
148 | |||
149 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
150 | 32766 | inputVal = _mm256_loadu_ps(inputVectorPtr); | |
151 | 32766 | inputVectorPtr += 8; | |
152 | |||
153 | // Scale and clip | ||
154 | 98298 | ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), | |
155 | vmin_val); | ||
156 | |||
157 | 32766 | intInputVal = _mm256_cvtps_epi32(ret); | |
158 | |||
159 | 32766 | intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); | |
160 | 32766 | intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); | |
161 | |||
162 | 32766 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
163 | |||
164 | _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
165 | 32766 | outputVectorPtr += 8; | |
166 | } | ||
167 | |||
168 | 2 | number = eighthPoints * 8; | |
169 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
170 | 14 | r = inputVector[number] * scalar; | |
171 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (r > max_val) |
172 | ✗ | r = max_val; | |
173 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | else if (r < min_val) |
174 | ✗ | r = min_val; | |
175 | 14 | outputVector[number] = (int16_t)rintf(r); | |
176 | } | ||
177 | 2 | } | |
178 | #endif /* LV_HAVE_AVX */ | ||
179 | |||
180 | |||
181 | #ifdef LV_HAVE_SSE2 | ||
182 | #include <emmintrin.h> | ||
183 | |||
184 | 2 | static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, | |
185 | const float* inputVector, | ||
186 | const float scalar, | ||
187 | unsigned int num_points) | ||
188 | { | ||
189 | 2 | unsigned int number = 0; | |
190 | |||
191 | 2 | const unsigned int eighthPoints = num_points / 8; | |
192 | |||
193 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
194 | 2 | int16_t* outputVectorPtr = outputVector; | |
195 | |||
196 | 2 | float min_val = SHRT_MIN; | |
197 | 2 | float max_val = SHRT_MAX; | |
198 | float r; | ||
199 | |||
200 | 2 | __m128 vScalar = _mm_set_ps1(scalar); | |
201 | __m128 inputVal1, inputVal2; | ||
202 | __m128i intInputVal1, intInputVal2; | ||
203 | __m128 ret1, ret2; | ||
204 | 2 | __m128 vmin_val = _mm_set_ps1(min_val); | |
205 | 2 | __m128 vmax_val = _mm_set_ps1(max_val); | |
206 | |||
207 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
208 | 32766 | inputVal1 = _mm_loadu_ps(inputVectorPtr); | |
209 | 32766 | inputVectorPtr += 4; | |
210 | 32766 | inputVal2 = _mm_loadu_ps(inputVectorPtr); | |
211 | 32766 | inputVectorPtr += 4; | |
212 | |||
213 | // Scale and clip | ||
214 | 98298 | ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); | |
215 | 98298 | ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); | |
216 | |||
217 | 32766 | intInputVal1 = _mm_cvtps_epi32(ret1); | |
218 | 32766 | intInputVal2 = _mm_cvtps_epi32(ret2); | |
219 | |||
220 | 32766 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
221 | |||
222 | _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
223 | 32766 | outputVectorPtr += 8; | |
224 | } | ||
225 | |||
226 | 2 | number = eighthPoints * 8; | |
227 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
228 | 14 | r = inputVector[number] * scalar; | |
229 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (r > max_val) |
230 | ✗ | r = max_val; | |
231 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | else if (r < min_val) |
232 | ✗ | r = min_val; | |
233 | 14 | outputVector[number] = (int16_t)rintf(r); | |
234 | } | ||
235 | 2 | } | |
236 | #endif /* LV_HAVE_SSE2 */ | ||
237 | |||
238 | |||
239 | #ifdef LV_HAVE_SSE | ||
240 | #include <xmmintrin.h> | ||
241 | |||
242 | 2 | static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, | |
243 | const float* inputVector, | ||
244 | const float scalar, | ||
245 | unsigned int num_points) | ||
246 | { | ||
247 | 2 | unsigned int number = 0; | |
248 | |||
249 | 2 | const unsigned int quarterPoints = num_points / 4; | |
250 | |||
251 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
252 | 2 | int16_t* outputVectorPtr = outputVector; | |
253 | |||
254 | 2 | float min_val = SHRT_MIN; | |
255 | 2 | float max_val = SHRT_MAX; | |
256 | float r; | ||
257 | |||
258 | 2 | __m128 vScalar = _mm_set_ps1(scalar); | |
259 | __m128 ret; | ||
260 | 2 | __m128 vmin_val = _mm_set_ps1(min_val); | |
261 | 2 | __m128 vmax_val = _mm_set_ps1(max_val); | |
262 | |||
263 | __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; | ||
264 | |||
265 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
266 | 65534 | ret = _mm_loadu_ps(inputVectorPtr); | |
267 | 65534 | inputVectorPtr += 4; | |
268 | |||
269 | // Scale and clip | ||
270 | 196602 | ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); | |
271 | |||
272 | _mm_store_ps(outputFloatBuffer, ret); | ||
273 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); | |
274 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); | |
275 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); | |
276 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); | |
277 | } | ||
278 | |||
279 | 2 | number = quarterPoints * 4; | |
280 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
281 | 6 | r = inputVector[number] * scalar; | |
282 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (r > max_val) |
283 | ✗ | r = max_val; | |
284 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | else if (r < min_val) |
285 | ✗ | r = min_val; | |
286 | 6 | outputVector[number] = (int16_t)rintf(r); | |
287 | } | ||
288 | 2 | } | |
289 | #endif /* LV_HAVE_SSE */ | ||
290 | |||
291 | |||
292 | #ifdef LV_HAVE_GENERIC | ||
293 | |||
294 | 2 | static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, | |
295 | const float* inputVector, | ||
296 | const float scalar, | ||
297 | unsigned int num_points) | ||
298 | { | ||
299 | 2 | int16_t* outputVectorPtr = outputVector; | |
300 | 2 | const float* inputVectorPtr = inputVector; | |
301 | 2 | unsigned int number = 0; | |
302 | 2 | float min_val = SHRT_MIN; | |
303 | 2 | float max_val = SHRT_MAX; | |
304 | float r; | ||
305 | |||
306 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
307 | 262142 | r = *inputVectorPtr++ * scalar; | |
308 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.
|
262142 | if (r > max_val) |
309 | ✗ | r = max_val; | |
310 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.
|
262142 | else if (r < min_val) |
311 | ✗ | r = min_val; | |
312 | 262142 | *outputVectorPtr++ = (int16_t)rintf(r); | |
313 | } | ||
314 | 2 | } | |
315 | #endif /* LV_HAVE_GENERIC */ | ||
316 | |||
317 | |||
318 | #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */ | ||
319 | #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H | ||
320 | #define INCLUDED_volk_32f_s32f_convert_16i_a_H | ||
321 | |||
322 | #include <inttypes.h> | ||
323 | #include <math.h> | ||
324 | #include <stdio.h> | ||
325 | #include <volk/volk_common.h> | ||
326 | |||
327 | #ifdef LV_HAVE_AVX2 | ||
328 | #include <immintrin.h> | ||
329 | |||
330 | 2 | static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, | |
331 | const float* inputVector, | ||
332 | const float scalar, | ||
333 | unsigned int num_points) | ||
334 | { | ||
335 | 2 | unsigned int number = 0; | |
336 | |||
337 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
338 | |||
339 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
340 | 2 | int16_t* outputVectorPtr = outputVector; | |
341 | |||
342 | 2 | float min_val = SHRT_MIN; | |
343 | 2 | float max_val = SHRT_MAX; | |
344 | float r; | ||
345 | |||
346 | 2 | __m256 vScalar = _mm256_set1_ps(scalar); | |
347 | __m256 inputVal1, inputVal2; | ||
348 | __m256i intInputVal1, intInputVal2; | ||
349 | __m256 ret1, ret2; | ||
350 | 2 | __m256 vmin_val = _mm256_set1_ps(min_val); | |
351 | 2 | __m256 vmax_val = _mm256_set1_ps(max_val); | |
352 | |||
353 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
354 | 16382 | inputVal1 = _mm256_load_ps(inputVectorPtr); | |
355 | 16382 | inputVectorPtr += 8; | |
356 | 16382 | inputVal2 = _mm256_load_ps(inputVectorPtr); | |
357 | 16382 | inputVectorPtr += 8; | |
358 | |||
359 | // Scale and clip | ||
360 | 49146 | ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), | |
361 | vmin_val); | ||
362 | 49146 | ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), | |
363 | vmin_val); | ||
364 | |||
365 | 16382 | intInputVal1 = _mm256_cvtps_epi32(ret1); | |
366 | 16382 | intInputVal2 = _mm256_cvtps_epi32(ret2); | |
367 | |||
368 | 16382 | intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); | |
369 | 16382 | intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); | |
370 | |||
371 | _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); | ||
372 | 16382 | outputVectorPtr += 16; | |
373 | } | ||
374 | |||
375 | 2 | number = sixteenthPoints * 16; | |
376 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
377 | 30 | r = inputVector[number] * scalar; | |
378 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | if (r > max_val) |
379 | ✗ | r = max_val; | |
380 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | else if (r < min_val) |
381 | ✗ | r = min_val; | |
382 | 30 | outputVector[number] = (int16_t)rintf(r); | |
383 | } | ||
384 | 2 | } | |
385 | #endif /* LV_HAVE_AVX2 */ | ||
386 | |||
387 | |||
388 | #ifdef LV_HAVE_AVX | ||
389 | #include <immintrin.h> | ||
390 | |||
391 | 2 | static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, | |
392 | const float* inputVector, | ||
393 | const float scalar, | ||
394 | unsigned int num_points) | ||
395 | { | ||
396 | 2 | unsigned int number = 0; | |
397 | |||
398 | 2 | const unsigned int eighthPoints = num_points / 8; | |
399 | |||
400 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
401 | 2 | int16_t* outputVectorPtr = outputVector; | |
402 | |||
403 | 2 | float min_val = SHRT_MIN; | |
404 | 2 | float max_val = SHRT_MAX; | |
405 | float r; | ||
406 | |||
407 | 2 | __m256 vScalar = _mm256_set1_ps(scalar); | |
408 | __m256 inputVal, ret; | ||
409 | __m256i intInputVal; | ||
410 | __m128i intInputVal1, intInputVal2; | ||
411 | 2 | __m256 vmin_val = _mm256_set1_ps(min_val); | |
412 | 2 | __m256 vmax_val = _mm256_set1_ps(max_val); | |
413 | |||
414 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
415 | 32766 | inputVal = _mm256_load_ps(inputVectorPtr); | |
416 | 32766 | inputVectorPtr += 8; | |
417 | |||
418 | // Scale and clip | ||
419 | 98298 | ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), | |
420 | vmin_val); | ||
421 | |||
422 | 32766 | intInputVal = _mm256_cvtps_epi32(ret); | |
423 | |||
424 | 32766 | intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); | |
425 | 32766 | intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); | |
426 | |||
427 | 32766 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
428 | |||
429 | _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
430 | 32766 | outputVectorPtr += 8; | |
431 | } | ||
432 | |||
433 | 2 | number = eighthPoints * 8; | |
434 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
435 | 14 | r = inputVector[number] * scalar; | |
436 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (r > max_val) |
437 | ✗ | r = max_val; | |
438 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | else if (r < min_val) |
439 | ✗ | r = min_val; | |
440 | 14 | outputVector[number] = (int16_t)rintf(r); | |
441 | } | ||
442 | 2 | } | |
443 | #endif /* LV_HAVE_AVX */ | ||
444 | |||
445 | #ifdef LV_HAVE_SSE2 | ||
446 | #include <emmintrin.h> | ||
447 | |||
448 | 2 | static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, | |
449 | const float* inputVector, | ||
450 | const float scalar, | ||
451 | unsigned int num_points) | ||
452 | { | ||
453 | 2 | unsigned int number = 0; | |
454 | |||
455 | 2 | const unsigned int eighthPoints = num_points / 8; | |
456 | |||
457 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
458 | 2 | int16_t* outputVectorPtr = outputVector; | |
459 | |||
460 | 2 | float min_val = SHRT_MIN; | |
461 | 2 | float max_val = SHRT_MAX; | |
462 | float r; | ||
463 | |||
464 | 2 | __m128 vScalar = _mm_set_ps1(scalar); | |
465 | __m128 inputVal1, inputVal2; | ||
466 | __m128i intInputVal1, intInputVal2; | ||
467 | __m128 ret1, ret2; | ||
468 | 2 | __m128 vmin_val = _mm_set_ps1(min_val); | |
469 | 2 | __m128 vmax_val = _mm_set_ps1(max_val); | |
470 | |||
471 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
472 | 32766 | inputVal1 = _mm_load_ps(inputVectorPtr); | |
473 | 32766 | inputVectorPtr += 4; | |
474 | 32766 | inputVal2 = _mm_load_ps(inputVectorPtr); | |
475 | 32766 | inputVectorPtr += 4; | |
476 | |||
477 | // Scale and clip | ||
478 | 98298 | ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); | |
479 | 98298 | ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); | |
480 | |||
481 | 32766 | intInputVal1 = _mm_cvtps_epi32(ret1); | |
482 | 32766 | intInputVal2 = _mm_cvtps_epi32(ret2); | |
483 | |||
484 | 32766 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
485 | |||
486 | _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
487 | 32766 | outputVectorPtr += 8; | |
488 | } | ||
489 | |||
490 | 2 | number = eighthPoints * 8; | |
491 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
492 | 14 | r = inputVector[number] * scalar; | |
493 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (r > max_val) |
494 | ✗ | r = max_val; | |
495 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | else if (r < min_val) |
496 | ✗ | r = min_val; | |
497 | 14 | outputVector[number] = (int16_t)rintf(r); | |
498 | } | ||
499 | 2 | } | |
500 | #endif /* LV_HAVE_SSE2 */ | ||
501 | |||
502 | |||
503 | #ifdef LV_HAVE_SSE | ||
504 | #include <xmmintrin.h> | ||
505 | |||
506 | 2 | static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, | |
507 | const float* inputVector, | ||
508 | const float scalar, | ||
509 | unsigned int num_points) | ||
510 | { | ||
511 | 2 | unsigned int number = 0; | |
512 | |||
513 | 2 | const unsigned int quarterPoints = num_points / 4; | |
514 | |||
515 | 2 | const float* inputVectorPtr = (const float*)inputVector; | |
516 | 2 | int16_t* outputVectorPtr = outputVector; | |
517 | |||
518 | 2 | float min_val = SHRT_MIN; | |
519 | 2 | float max_val = SHRT_MAX; | |
520 | float r; | ||
521 | |||
522 | 2 | __m128 vScalar = _mm_set_ps1(scalar); | |
523 | __m128 ret; | ||
524 | 2 | __m128 vmin_val = _mm_set_ps1(min_val); | |
525 | 2 | __m128 vmax_val = _mm_set_ps1(max_val); | |
526 | |||
527 | __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; | ||
528 | |||
529 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
530 | 65534 | ret = _mm_load_ps(inputVectorPtr); | |
531 | 65534 | inputVectorPtr += 4; | |
532 | |||
533 | // Scale and clip | ||
534 | 196602 | ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); | |
535 | |||
536 | _mm_store_ps(outputFloatBuffer, ret); | ||
537 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); | |
538 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); | |
539 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); | |
540 | 65534 | *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); | |
541 | } | ||
542 | |||
543 | 2 | number = quarterPoints * 4; | |
544 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
545 | 6 | r = inputVector[number] * scalar; | |
546 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (r > max_val) |
547 | ✗ | r = max_val; | |
548 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | else if (r < min_val) |
549 | ✗ | r = min_val; | |
550 | 6 | outputVector[number] = (int16_t)rintf(r); | |
551 | } | ||
552 | 2 | } | |
553 | #endif /* LV_HAVE_SSE */ | ||
554 | |||
555 | |||
556 | #ifdef LV_HAVE_GENERIC | ||
557 | |||
558 | 2 | static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, | |
559 | const float* inputVector, | ||
560 | const float scalar, | ||
561 | unsigned int num_points) | ||
562 | { | ||
563 | 2 | int16_t* outputVectorPtr = outputVector; | |
564 | 2 | const float* inputVectorPtr = inputVector; | |
565 | 2 | unsigned int number = 0; | |
566 | 2 | float min_val = SHRT_MIN; | |
567 | 2 | float max_val = SHRT_MAX; | |
568 | float r; | ||
569 | |||
570 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
571 | 262142 | r = *inputVectorPtr++ * scalar; | |
572 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.
|
262142 | if (r < min_val) |
573 | ✗ | r = min_val; | |
574 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.
|
262142 | else if (r > max_val) |
575 | ✗ | r = max_val; | |
576 | 262142 | *outputVectorPtr++ = (int16_t)rintf(r); | |
577 | } | ||
578 | 2 | } | |
579 | #endif /* LV_HAVE_GENERIC */ | ||
580 | |||
581 | #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ | ||
582 |