Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2016 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_convert_16ic | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Converts a complex vector whose components are 32-bit floats into | ||
16 | * a complex vector whose components are 16-bit integers. | ||
17 | * Values are rounded to integers and saturated to the limits of the output data type. | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, | ||
22 | * unsigned int num_points); \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li inputVector: The complex 32-bit float input data buffer. | ||
26 | * \li num_points: The number of complex data values to be converted. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li outputVector: The complex 16-bit integer output data buffer. | ||
30 | * | ||
31 | */ | ||
32 | |||
33 | #ifndef INCLUDED_volk_32fc_convert_16ic_a_H | ||
34 | #define INCLUDED_volk_32fc_convert_16ic_a_H | ||
35 | |||
36 | #include "volk/volk_complex.h" | ||
37 | #include <limits.h> | ||
38 | #include <math.h> | ||
39 | |||
40 | #ifdef LV_HAVE_AVX2 | ||
41 | #include <immintrin.h> | ||
42 | |||
43 | 2 | static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, | |
44 | const lv_32fc_t* inputVector, | ||
45 | unsigned int num_points) | ||
46 | { | ||
47 | 2 | const unsigned int avx_iters = num_points / 8; | |
48 | |||
49 | 2 | float* inputVectorPtr = (float*)inputVector; | |
50 | 2 | int16_t* outputVectorPtr = (int16_t*)outputVector; | |
51 | float aux; | ||
52 | |||
53 | 2 | const float min_val = (float)SHRT_MIN; | |
54 | 2 | const float max_val = (float)SHRT_MAX; | |
55 | |||
56 | __m256 inputVal1, inputVal2; | ||
57 | __m256i intInputVal1, intInputVal2; | ||
58 | __m256 ret1, ret2; | ||
59 | 2 | const __m256 vmin_val = _mm256_set1_ps(min_val); | |
60 | 2 | const __m256 vmax_val = _mm256_set1_ps(max_val); | |
61 | unsigned int i; | ||
62 | |||
63 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (i = 0; i < avx_iters; i++) { |
64 | 32766 | inputVal1 = _mm256_load_ps((float*)inputVectorPtr); | |
65 | 32766 | inputVectorPtr += 8; | |
66 | 32766 | inputVal2 = _mm256_load_ps((float*)inputVectorPtr); | |
67 | 32766 | inputVectorPtr += 8; | |
68 | 32766 | __VOLK_PREFETCH(inputVectorPtr + 16); | |
69 | |||
70 | // Clip | ||
71 | 65532 | ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); | |
72 | 65532 | ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); | |
73 | |||
74 | 32766 | intInputVal1 = _mm256_cvtps_epi32(ret1); | |
75 | 32766 | intInputVal2 = _mm256_cvtps_epi32(ret2); | |
76 | |||
77 | 32766 | intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); | |
78 | 32766 | intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); | |
79 | |||
80 | _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); | ||
81 | 32766 | outputVectorPtr += 16; | |
82 | } | ||
83 | |||
84 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.
|
30 | for (i = avx_iters * 16; i < num_points * 2; i++) { |
85 | 28 | aux = *inputVectorPtr++; | |
86 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
|
28 | if (aux > max_val) |
87 | ✗ | aux = max_val; | |
88 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
|
28 | else if (aux < min_val) |
89 | ✗ | aux = min_val; | |
90 | 28 | *outputVectorPtr++ = (int16_t)rintf(aux); | |
91 | } | ||
92 | 2 | } | |
93 | #endif /* LV_HAVE_AVX2 */ | ||
94 | |||
95 | #ifdef LV_HAVE_SSE2 | ||
96 | #include <emmintrin.h> | ||
97 | |||
98 | 2 | static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, | |
99 | const lv_32fc_t* inputVector, | ||
100 | unsigned int num_points) | ||
101 | { | ||
102 | 2 | const unsigned int sse_iters = num_points / 4; | |
103 | |||
104 | 2 | float* inputVectorPtr = (float*)inputVector; | |
105 | 2 | int16_t* outputVectorPtr = (int16_t*)outputVector; | |
106 | float aux; | ||
107 | |||
108 | 2 | const float min_val = (float)SHRT_MIN; | |
109 | 2 | const float max_val = (float)SHRT_MAX; | |
110 | |||
111 | __m128 inputVal1, inputVal2; | ||
112 | __m128i intInputVal1, intInputVal2; | ||
113 | __m128 ret1, ret2; | ||
114 | 2 | const __m128 vmin_val = _mm_set_ps1(min_val); | |
115 | 2 | const __m128 vmax_val = _mm_set_ps1(max_val); | |
116 | unsigned int i; | ||
117 | |||
118 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (i = 0; i < sse_iters; i++) { |
119 | 65534 | inputVal1 = _mm_load_ps((float*)inputVectorPtr); | |
120 | 65534 | inputVectorPtr += 4; | |
121 | 65534 | inputVal2 = _mm_load_ps((float*)inputVectorPtr); | |
122 | 65534 | inputVectorPtr += 4; | |
123 | 65534 | __VOLK_PREFETCH(inputVectorPtr + 8); | |
124 | |||
125 | // Clip | ||
126 | 131068 | ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); | |
127 | 131068 | ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); | |
128 | |||
129 | 65534 | intInputVal1 = _mm_cvtps_epi32(ret1); | |
130 | 65534 | intInputVal2 = _mm_cvtps_epi32(ret2); | |
131 | |||
132 | 65534 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
133 | |||
134 | _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
135 | 65534 | outputVectorPtr += 8; | |
136 | } | ||
137 | |||
138 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.
|
14 | for (i = sse_iters * 8; i < num_points * 2; i++) { |
139 | 12 | aux = *inputVectorPtr++; | |
140 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | if (aux > max_val) |
141 | ✗ | aux = max_val; | |
142 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | else if (aux < min_val) |
143 | ✗ | aux = min_val; | |
144 | 12 | *outputVectorPtr++ = (int16_t)rintf(aux); | |
145 | } | ||
146 | 2 | } | |
147 | #endif /* LV_HAVE_SSE2 */ | ||
148 | |||
149 | |||
150 | #if LV_HAVE_NEONV7 | ||
151 | #include <arm_neon.h> | ||
152 | |||
/* Converts the four float lanes of `value` to int32 lanes of `result`, one
 * scalar VCVTR.S32.F32 instruction per lane. Wrapped in do { } while (0) so
 * the macro expands to exactly one statement at the call site. */
#define VCVTRQ_S32_F32(result, value)                                               \
    do {                                                                            \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[0]) : "t"((value)[0]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[1]) : "t"((value)[1]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[2]) : "t"((value)[2]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[3]) : "t"((value)[3]) :); \
    } while (0)
158 | |||
/*!
 * \brief NEON (ARMv7) kernel: converts complex 32-bit float samples to
 *        complex 16-bit integer samples with rounding and saturation.
 *
 * Both buffers are reinterpreted as flat arrays of 2 * num_points scalar
 * components. The vector loop converts 4 complex points (8 floats) per
 * iteration; the remaining (num_points % 4) points are converted one
 * component at a time.
 *
 * \param outputVector The complex 16-bit integer output buffer.
 * \param inputVector  The complex 32-bit float input buffer (not modified).
 * \param num_points   The number of complex points to convert.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                               const lv_32fc_t* inputVector,
                                               unsigned int num_points)
{
    const unsigned int neon_iters = num_points / 4;
    /* Scalar components left over after the vector loop (always < 8). */
    const unsigned int tail_len = (num_points % 4) * 2;

    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    float32_t aux;
    unsigned int i;

    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);
    float32x4_t ret1, ret2, a, b;

    int32x4_t toint_a = { 0, 0, 0, 0 };
    int32x4_t toint_b = { 0, 0, 0, 0 };
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to [SHRT_MIN, SHRT_MAX]. */
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        // vcvtr takes into account the current rounding mode (as does rintf)
        VCVTRQ_S32_F32(toint_a, ret1);
        VCVTRQ_S32_F32(toint_b, ret2);

        /* Saturating narrow int32 -> int16, then join both halves. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    for (i = 0; i < tail_len; i++) {
        aux = *inputVectorPtr++;
        /* NaN fails both range comparisons; casting it to int16_t would be
         * undefined behaviour, so map it to 0 explicitly.
         * NOTE(review): 0 is believed to be what the vector loop yields for
         * a NaN lane (NaN-propagating min/max, then float->int conversion);
         * confirm on target hardware. */
        if (isnan(aux))
            aux = 0.0f;
        else if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
214 | |||
215 | #undef VCVTRQ_S32_F32 | ||
216 | #endif /* LV_HAVE_NEONV7 */ | ||
217 | |||
218 | #if LV_HAVE_NEONV8 | ||
219 | #include <arm_neon.h> | ||
220 | |||
/*!
 * \brief NEON (ARMv8) kernel: converts complex 32-bit float samples to
 *        complex 16-bit integer samples with rounding and saturation.
 *
 * Both buffers are reinterpreted as flat arrays of 2 * num_points scalar
 * components. The vector loop converts 4 complex points (8 floats) per
 * iteration; the remaining (num_points % 4) points are converted one
 * component at a time.
 *
 * \param outputVector The complex 16-bit integer output buffer.
 * \param inputVector  The complex 32-bit float input buffer (not modified).
 * \param num_points   The number of complex points to convert.
 */
static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    const unsigned int neon_iters = num_points / 4;
    /* Scalar components left over after the vector loop (always < 8). */
    const unsigned int tail_len = (num_points % 4) * 2;

    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    float32_t aux;
    unsigned int i;

    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);
    float32x4_t ret1, ret2, a, b;

    int32x4_t toint_a, toint_b;
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to [SHRT_MIN, SHRT_MAX]. */
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        // vrndiq takes into account the current rounding mode (as does rintf)
        toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
        toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));

        /* Saturating narrow int32 -> int16, then join both halves. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    for (i = 0; i < tail_len; i++) {
        aux = *inputVectorPtr++;
        /* NaN fails both range comparisons; casting it to int16_t would be
         * undefined behaviour, so map it to 0 explicitly.
         * NOTE(review): 0 is believed to be what the vector loop yields for
         * a NaN lane (NaN-propagating min/max, then float->int conversion);
         * confirm on target hardware. */
        if (isnan(aux))
            aux = 0.0f;
        else if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
274 | #endif /* LV_HAVE_NEONV8 */ | ||
275 | |||
276 | |||
277 | #ifdef LV_HAVE_GENERIC | ||
278 | |||
279 | 2 | static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, | |
280 | const lv_32fc_t* inputVector, | ||
281 | unsigned int num_points) | ||
282 | { | ||
283 | 2 | float* inputVectorPtr = (float*)inputVector; | |
284 | 2 | int16_t* outputVectorPtr = (int16_t*)outputVector; | |
285 | 2 | const float min_val = (float)SHRT_MIN; | |
286 | 2 | const float max_val = (float)SHRT_MAX; | |
287 | float aux; | ||
288 | unsigned int i; | ||
289 |
2/2✓ Branch 0 taken 524284 times.
✓ Branch 1 taken 2 times.
|
524286 | for (i = 0; i < num_points * 2; i++) { |
290 | 524284 | aux = *inputVectorPtr++; | |
291 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.
|
524284 | if (aux > max_val) |
292 | ✗ | aux = max_val; | |
293 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.
|
524284 | else if (aux < min_val) |
294 | ✗ | aux = min_val; | |
295 | 524284 | *outputVectorPtr++ = (int16_t)rintf(aux); | |
296 | } | ||
297 | 2 | } | |
298 | #endif /* LV_HAVE_GENERIC */ | ||
299 | |||
300 | #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */ | ||
301 | |||
302 | #ifndef INCLUDED_volk_32fc_convert_16ic_u_H | ||
303 | #define INCLUDED_volk_32fc_convert_16ic_u_H | ||
304 | |||
305 | #include "volk/volk_complex.h" | ||
306 | #include <limits.h> | ||
307 | #include <math.h> | ||
308 | |||
309 | |||
310 | #ifdef LV_HAVE_AVX2 | ||
311 | #include <immintrin.h> | ||
312 | |||
313 | 2 | static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, | |
314 | const lv_32fc_t* inputVector, | ||
315 | unsigned int num_points) | ||
316 | { | ||
317 | 2 | const unsigned int avx_iters = num_points / 8; | |
318 | |||
319 | 2 | float* inputVectorPtr = (float*)inputVector; | |
320 | 2 | int16_t* outputVectorPtr = (int16_t*)outputVector; | |
321 | float aux; | ||
322 | |||
323 | 2 | const float min_val = (float)SHRT_MIN; | |
324 | 2 | const float max_val = (float)SHRT_MAX; | |
325 | |||
326 | __m256 inputVal1, inputVal2; | ||
327 | __m256i intInputVal1, intInputVal2; | ||
328 | __m256 ret1, ret2; | ||
329 | 2 | const __m256 vmin_val = _mm256_set1_ps(min_val); | |
330 | 2 | const __m256 vmax_val = _mm256_set1_ps(max_val); | |
331 | unsigned int i; | ||
332 | |||
333 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (i = 0; i < avx_iters; i++) { |
334 | 32766 | inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); | |
335 | 32766 | inputVectorPtr += 8; | |
336 | 32766 | inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); | |
337 | 32766 | inputVectorPtr += 8; | |
338 | 32766 | __VOLK_PREFETCH(inputVectorPtr + 16); | |
339 | |||
340 | // Clip | ||
341 | 65532 | ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); | |
342 | 65532 | ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); | |
343 | |||
344 | 32766 | intInputVal1 = _mm256_cvtps_epi32(ret1); | |
345 | 32766 | intInputVal2 = _mm256_cvtps_epi32(ret2); | |
346 | |||
347 | 32766 | intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); | |
348 | 32766 | intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); | |
349 | |||
350 | _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); | ||
351 | 32766 | outputVectorPtr += 16; | |
352 | } | ||
353 | |||
354 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.
|
30 | for (i = avx_iters * 16; i < num_points * 2; i++) { |
355 | 28 | aux = *inputVectorPtr++; | |
356 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
|
28 | if (aux > max_val) |
357 | ✗ | aux = max_val; | |
358 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
|
28 | else if (aux < min_val) |
359 | ✗ | aux = min_val; | |
360 | 28 | *outputVectorPtr++ = (int16_t)rintf(aux); | |
361 | } | ||
362 | 2 | } | |
363 | #endif /* LV_HAVE_AVX2 */ | ||
364 | |||
365 | |||
366 | #ifdef LV_HAVE_SSE2 | ||
367 | #include <emmintrin.h> | ||
368 | |||
369 | 2 | static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, | |
370 | const lv_32fc_t* inputVector, | ||
371 | unsigned int num_points) | ||
372 | { | ||
373 | 2 | const unsigned int sse_iters = num_points / 4; | |
374 | |||
375 | 2 | float* inputVectorPtr = (float*)inputVector; | |
376 | 2 | int16_t* outputVectorPtr = (int16_t*)outputVector; | |
377 | float aux; | ||
378 | |||
379 | 2 | const float min_val = (float)SHRT_MIN; | |
380 | 2 | const float max_val = (float)SHRT_MAX; | |
381 | |||
382 | __m128 inputVal1, inputVal2; | ||
383 | __m128i intInputVal1, intInputVal2; | ||
384 | __m128 ret1, ret2; | ||
385 | 2 | const __m128 vmin_val = _mm_set_ps1(min_val); | |
386 | 2 | const __m128 vmax_val = _mm_set_ps1(max_val); | |
387 | |||
388 | unsigned int i; | ||
389 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (i = 0; i < sse_iters; i++) { |
390 | 65534 | inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); | |
391 | 65534 | inputVectorPtr += 4; | |
392 | 65534 | inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); | |
393 | 65534 | inputVectorPtr += 4; | |
394 | 65534 | __VOLK_PREFETCH(inputVectorPtr + 8); | |
395 | |||
396 | // Clip | ||
397 | 131068 | ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); | |
398 | 131068 | ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); | |
399 | |||
400 | 65534 | intInputVal1 = _mm_cvtps_epi32(ret1); | |
401 | 65534 | intInputVal2 = _mm_cvtps_epi32(ret2); | |
402 | |||
403 | 65534 | intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); | |
404 | |||
405 | _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); | ||
406 | 65534 | outputVectorPtr += 8; | |
407 | } | ||
408 | |||
409 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.
|
14 | for (i = sse_iters * 8; i < num_points * 2; i++) { |
410 | 12 | aux = *inputVectorPtr++; | |
411 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | if (aux > max_val) |
412 | ✗ | aux = max_val; | |
413 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | else if (aux < min_val) |
414 | ✗ | aux = min_val; | |
415 | 12 | *outputVectorPtr++ = (int16_t)rintf(aux); | |
416 | } | ||
417 | 2 | } | |
418 | #endif /* LV_HAVE_SSE2 */ | ||
419 | #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ | ||
420 |