Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2021 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_index_min_16u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the | ||
16 | * minimum magnitude for complex points in the given vector. | ||
17 | * | ||
18 | * Note that num_points is a uint32_t, but the return value is | ||
19 | * uint16_t. Providing a vector larger than the max of a uint16_t | ||
20 | * (65536) would miss anything outside of this boundary. The kernel | ||
21 | * will check the length of num_points and cap it to this max value, | ||
22 | * anyways. | ||
23 | * | ||
24 | * <b>Dispatcher Prototype</b> | ||
25 | * \code | ||
26 | * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t | ||
27 | * num_points) \endcode | ||
28 | * | ||
29 | * \b Inputs | ||
30 | * \li source: The complex input vector. | ||
31 | * \li num_points: The number of samples. | ||
32 | * | ||
33 | * \b Outputs | ||
34 | * \li target: The index of the point with minimum magnitude. | ||
35 | * | ||
36 | * \b Example | ||
37 | * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around | ||
38 | * the unit circle. | ||
39 | * \code | ||
40 | * int N = 10; | ||
41 | * uint32_t alignment = volk_get_alignment(); | ||
42 | * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
43 | * uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment); | ||
44 | * | ||
45 | * for(uint32_t ii = 0; ii < N/2; ++ii){ | ||
46 | * float real = 2.f * ((float)ii / (float)N) - 1.f; | ||
47 | * float imag = std::sqrt(1.f - real * real); | ||
48 | * in[ii] = lv_cmake(real, imag); | ||
49 | * in[ii] = in[ii] * in[ii] + in[ii]; | ||
50 | * in[N-ii] = lv_cmake(real, imag); | ||
51 | * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii]; | ||
52 | * } | ||
53 | * | ||
54 | * volk_32fc_index_min_16u(min, in, N); | ||
55 | * | ||
56 | * printf("index of min value = %u\n", *min); | ||
57 | * | ||
58 | * volk_free(in); | ||
59 | * volk_free(min); | ||
60 | * \endcode | ||
61 | */ | ||
62 | |||
63 | #ifndef INCLUDED_volk_32fc_index_min_16u_a_H | ||
64 | #define INCLUDED_volk_32fc_index_min_16u_a_H | ||
65 | |||
66 | #include <inttypes.h> | ||
67 | #include <limits.h> | ||
68 | #include <stdio.h> | ||
69 | #include <volk/volk_common.h> | ||
70 | #include <volk/volk_complex.h> | ||
71 | |||
72 | #ifdef LV_HAVE_AVX2 | ||
73 | #include <immintrin.h> | ||
74 | #include <volk/volk_avx2_intrinsics.h> | ||
75 | |||
76 | 2 | static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, | |
77 | const lv_32fc_t* source, | ||
78 | uint32_t num_points) | ||
79 | { | ||
80 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
81 | |||
82 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
83 | /* | ||
84 | * At the start of each loop iteration current_indices holds the indices of | ||
85 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
86 | * in implementation of vector_32fc_index_min_variant0(). | ||
87 | */ | ||
88 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
89 | |||
90 | 2 | __m256 min_values = _mm256_set1_ps(FLT_MAX); | |
91 | 2 | __m256i min_indices = _mm256_setzero_si256(); | |
92 | |||
93 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
94 | 16382 | __m256 in0 = _mm256_load_ps((float*)source); | |
95 | 16382 | __m256 in1 = _mm256_load_ps((float*)(source + 4)); | |
96 | 16382 | vector_32fc_index_min_variant0( | |
97 | in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); | ||
98 | 16382 | source += 8; | |
99 | } | ||
100 | |||
101 | // determine minimum value and index in the result of the vectorized loop | ||
102 | __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; | ||
103 | __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; | ||
104 | 2 | _mm256_store_ps(min_values_buffer, min_values); | |
105 | 2 | _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); | |
106 | |||
107 | 2 | float min = FLT_MAX; | |
108 | 2 | uint32_t index = 0; | |
109 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
110 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (min_values_buffer[i] < min) { |
111 | 4 | min = min_values_buffer[i]; | |
112 | 4 | index = min_indices_buffer[i]; | |
113 | } | ||
114 | } | ||
115 | |||
116 | // handle tail not processed by the vectorized loop | ||
117 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
118 | 14 | const float abs_squared = | |
119 | 14 | lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); | |
120 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared < min) { |
121 | ✗ | min = abs_squared; | |
122 | ✗ | index = i; | |
123 | } | ||
124 | 14 | ++source; | |
125 | } | ||
126 | |||
127 | 2 | *target = index; | |
128 | 2 | } | |
129 | |||
130 | #endif /*LV_HAVE_AVX2*/ | ||
131 | |||
132 | #ifdef LV_HAVE_AVX2 | ||
133 | #include <immintrin.h> | ||
134 | #include <volk/volk_avx2_intrinsics.h> | ||
135 | |||
136 | 2 | static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, | |
137 | const lv_32fc_t* source, | ||
138 | uint32_t num_points) | ||
139 | { | ||
140 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
141 | |||
142 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
143 | /* | ||
144 | * At the start of each loop iteration current_indices holds the indices of | ||
145 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
146 | * in implementation of vector_32fc_index_min_variant0(). | ||
147 | */ | ||
148 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
149 | |||
150 | 2 | __m256 min_values = _mm256_set1_ps(FLT_MAX); | |
151 | 2 | __m256i min_indices = _mm256_setzero_si256(); | |
152 | |||
153 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
154 | 16382 | __m256 in0 = _mm256_load_ps((float*)source); | |
155 | 16382 | __m256 in1 = _mm256_load_ps((float*)(source + 4)); | |
156 | 16382 | vector_32fc_index_min_variant1( | |
157 | in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); | ||
158 | 16382 | source += 8; | |
159 | } | ||
160 | |||
161 | // determine minimum value and index in the result of the vectorized loop | ||
162 | __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; | ||
163 | __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; | ||
164 | 2 | _mm256_store_ps(min_values_buffer, min_values); | |
165 | 2 | _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); | |
166 | |||
167 | 2 | float min = FLT_MAX; | |
168 | 2 | uint32_t index = 0; | |
169 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
170 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (min_values_buffer[i] < min) { |
171 | 4 | min = min_values_buffer[i]; | |
172 | 4 | index = min_indices_buffer[i]; | |
173 | } | ||
174 | } | ||
175 | |||
176 | // handle tail not processed by the vectorized loop | ||
177 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
178 | 14 | const float abs_squared = | |
179 | 14 | lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); | |
180 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared < min) { |
181 | ✗ | min = abs_squared; | |
182 | ✗ | index = i; | |
183 | } | ||
184 | 14 | ++source; | |
185 | } | ||
186 | |||
187 | 2 | *target = index; | |
188 | 2 | } | |
189 | |||
190 | #endif /*LV_HAVE_AVX2*/ | ||
191 | |||
192 | #ifdef LV_HAVE_SSE3 | ||
193 | #include <pmmintrin.h> | ||
194 | #include <xmmintrin.h> | ||
195 | |||
196 | 2 | static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target, | |
197 | const lv_32fc_t* source, | ||
198 | uint32_t num_points) | ||
199 | { | ||
200 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
201 | |||
202 | union bit128 holderf; | ||
203 | union bit128 holderi; | ||
204 | 2 | float sq_dist = 0.0; | |
205 | |||
206 | union bit128 xmm5, xmm4; | ||
207 | __m128 xmm1, xmm2, xmm3; | ||
208 | __m128i xmm8, xmm11, xmm12, xmm9, xmm10; | ||
209 | |||
210 | 2 | xmm5.int_vec = _mm_setzero_si128(); | |
211 | 2 | xmm4.int_vec = _mm_setzero_si128(); | |
212 | 2 | holderf.int_vec = _mm_setzero_si128(); | |
213 | 2 | holderi.int_vec = _mm_setzero_si128(); | |
214 | |||
215 | 2 | xmm8 = _mm_setr_epi32(0, 1, 2, 3); | |
216 | 2 | xmm9 = _mm_setzero_si128(); | |
217 | 2 | xmm10 = _mm_setr_epi32(4, 4, 4, 4); | |
218 | 2 | xmm3 = _mm_set_ps1(FLT_MAX); | |
219 | |||
220 | 2 | int bound = num_points >> 2; | |
221 | |||
222 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (int i = 0; i < bound; ++i) { |
223 | 32766 | xmm1 = _mm_load_ps((float*)source); | |
224 | 32766 | xmm2 = _mm_load_ps((float*)&source[2]); | |
225 | |||
226 | 32766 | source += 4; | |
227 | |||
228 | 65532 | xmm1 = _mm_mul_ps(xmm1, xmm1); | |
229 | 32766 | xmm2 = _mm_mul_ps(xmm2, xmm2); | |
230 | |||
231 | 32766 | xmm1 = _mm_hadd_ps(xmm1, xmm2); | |
232 | |||
233 | 32766 | xmm3 = _mm_min_ps(xmm1, xmm3); | |
234 | |||
235 | 32766 | xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); | |
236 | 32766 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
237 | |||
238 | 32766 | xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); | |
239 | 65532 | xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); | |
240 | |||
241 | 32766 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
242 | |||
243 | 65532 | xmm8 = _mm_add_epi32(xmm8, xmm10); | |
244 | } | ||
245 | |||
246 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points >> 1 & 1) { |
247 | 2 | xmm2 = _mm_load_ps((float*)source); | |
248 | |||
249 | 2 | xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); | |
250 | 2 | xmm8 = bit128_p(&xmm1)->int_vec; | |
251 | |||
252 | 2 | xmm2 = _mm_mul_ps(xmm2, xmm2); | |
253 | |||
254 | 2 | source += 2; | |
255 | |||
256 | 2 | xmm1 = _mm_hadd_ps(xmm2, xmm2); | |
257 | |||
258 | 4 | xmm3 = _mm_min_ps(xmm1, xmm3); | |
259 | |||
260 | 2 | xmm10 = _mm_setr_epi32(2, 2, 2, 2); | |
261 | |||
262 | 2 | xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); | |
263 | 2 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
264 | |||
265 | 2 | xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); | |
266 | 4 | xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); | |
267 | |||
268 | 2 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
269 | |||
270 | 4 | xmm8 = _mm_add_epi32(xmm8, xmm10); | |
271 | } | ||
272 | |||
273 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points & 1) { |
274 | 2 | sq_dist = lv_creal(source[0]) * lv_creal(source[0]) + | |
275 | 2 | lv_cimag(source[0]) * lv_cimag(source[0]); | |
276 | |||
277 | 2 | xmm2 = _mm_load1_ps(&sq_dist); | |
278 | |||
279 | 2 | xmm1 = xmm3; | |
280 | |||
281 | 2 | xmm3 = _mm_min_ss(xmm3, xmm2); | |
282 | |||
283 | 2 | xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); | |
284 | 2 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
285 | |||
286 | 2 | xmm8 = _mm_shuffle_epi32(xmm8, 0x00); | |
287 | |||
288 | 2 | xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); | |
289 | 4 | xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); | |
290 | |||
291 | 2 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
292 | } | ||
293 | |||
294 | _mm_store_ps((float*)&(holderf.f), xmm3); | ||
295 | _mm_store_si128(&(holderi.int_vec), xmm9); | ||
296 | |||
297 | 2 | target[0] = holderi.i[0]; | |
298 | 2 | sq_dist = holderf.f[0]; | |
299 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0]; |
300 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist; |
301 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0]; |
302 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist; |
303 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0]; |
304 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist; |
305 | 2 | } | |
306 | |||
307 | #endif /*LV_HAVE_SSE3*/ | ||
308 | |||
309 | #ifdef LV_HAVE_GENERIC | ||
310 | 2 | static inline void volk_32fc_index_min_16u_generic(uint16_t* target, | |
311 | const lv_32fc_t* source, | ||
312 | uint32_t num_points) | ||
313 | { | ||
314 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
315 | |||
316 | 2 | float sq_dist = 0.0; | |
317 | 2 | float min = FLT_MAX; | |
318 | 2 | uint16_t index = 0; | |
319 | |||
320 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (uint32_t i = 0; i < num_points; ++i) { |
321 | 131070 | sq_dist = lv_creal(source[i]) * lv_creal(source[i]) + | |
322 | 131070 | lv_cimag(source[i]) * lv_cimag(source[i]); | |
323 | |||
324 |
2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 131052 times.
|
131070 | if (sq_dist < min) { |
325 | 18 | index = i; | |
326 | 18 | min = sq_dist; | |
327 | } | ||
328 | } | ||
329 | 2 | target[0] = index; | |
330 | 2 | } | |
331 | |||
332 | #endif /*LV_HAVE_GENERIC*/ | ||
333 | |||
334 | #endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/ | ||
335 | |||
336 | #ifndef INCLUDED_volk_32fc_index_min_16u_u_H | ||
337 | #define INCLUDED_volk_32fc_index_min_16u_u_H | ||
338 | |||
339 | #include <inttypes.h> | ||
340 | #include <limits.h> | ||
341 | #include <stdio.h> | ||
342 | #include <volk/volk_common.h> | ||
343 | #include <volk/volk_complex.h> | ||
344 | |||
345 | #ifdef LV_HAVE_AVX2 | ||
346 | #include <immintrin.h> | ||
347 | #include <volk/volk_avx2_intrinsics.h> | ||
348 | |||
349 | 2 | static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, | |
350 | const lv_32fc_t* source, | ||
351 | uint32_t num_points) | ||
352 | { | ||
353 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
354 | |||
355 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
356 | /* | ||
357 | * At the start of each loop iteration current_indices holds the indices of | ||
358 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
359 | * in implementation of vector_32fc_index_min_variant0(). | ||
360 | */ | ||
361 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
362 | |||
363 | 2 | __m256 min_values = _mm256_set1_ps(FLT_MAX); | |
364 | 2 | __m256i min_indices = _mm256_setzero_si256(); | |
365 | |||
366 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
367 | 16382 | __m256 in0 = _mm256_loadu_ps((float*)source); | |
368 | 16382 | __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); | |
369 | 16382 | vector_32fc_index_min_variant0( | |
370 | in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); | ||
371 | 16382 | source += 8; | |
372 | } | ||
373 | |||
374 | // determine minimum value and index in the result of the vectorized loop | ||
375 | __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; | ||
376 | __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; | ||
377 | 2 | _mm256_store_ps(min_values_buffer, min_values); | |
378 | 2 | _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); | |
379 | |||
380 | 2 | float min = FLT_MAX; | |
381 | 2 | uint32_t index = 0; | |
382 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
383 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (min_values_buffer[i] < min) { |
384 | 4 | min = min_values_buffer[i]; | |
385 | 4 | index = min_indices_buffer[i]; | |
386 | } | ||
387 | } | ||
388 | |||
389 | // handle tail not processed by the vectorized loop | ||
390 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
391 | 14 | const float abs_squared = | |
392 | 14 | lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); | |
393 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared < min) { |
394 | ✗ | min = abs_squared; | |
395 | ✗ | index = i; | |
396 | } | ||
397 | 14 | ++source; | |
398 | } | ||
399 | |||
400 | 2 | *target = index; | |
401 | 2 | } | |
402 | |||
403 | #endif /*LV_HAVE_AVX2*/ | ||
404 | |||
405 | #ifdef LV_HAVE_AVX2 | ||
406 | #include <immintrin.h> | ||
407 | #include <volk/volk_avx2_intrinsics.h> | ||
408 | |||
409 | 2 | static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, | |
410 | const lv_32fc_t* source, | ||
411 | uint32_t num_points) | ||
412 | { | ||
413 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
414 | |||
415 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
416 | /* | ||
417 | * At the start of each loop iteration current_indices holds the indices of | ||
418 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
419 | * in implementation of vector_32fc_index_min_variant0(). | ||
420 | */ | ||
421 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
422 | |||
423 | 2 | __m256 min_values = _mm256_set1_ps(FLT_MAX); | |
424 | 2 | __m256i min_indices = _mm256_setzero_si256(); | |
425 | |||
426 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
427 | 16382 | __m256 in0 = _mm256_loadu_ps((float*)source); | |
428 | 16382 | __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); | |
429 | 16382 | vector_32fc_index_min_variant1( | |
430 | in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); | ||
431 | 16382 | source += 8; | |
432 | } | ||
433 | |||
434 | // determine minimum value and index in the result of the vectorized loop | ||
435 | __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; | ||
436 | __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; | ||
437 | 2 | _mm256_store_ps(min_values_buffer, min_values); | |
438 | 2 | _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); | |
439 | |||
440 | 2 | float min = FLT_MAX; | |
441 | 2 | uint32_t index = 0; | |
442 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
443 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (min_values_buffer[i] < min) { |
444 | 4 | min = min_values_buffer[i]; | |
445 | 4 | index = min_indices_buffer[i]; | |
446 | } | ||
447 | } | ||
448 | |||
449 | // handle tail not processed by the vectorized loop | ||
450 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
451 | 14 | const float abs_squared = | |
452 | 14 | lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); | |
453 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared < min) { |
454 | ✗ | min = abs_squared; | |
455 | ✗ | index = i; | |
456 | } | ||
457 | 14 | ++source; | |
458 | } | ||
459 | |||
460 | 2 | *target = index; | |
461 | 2 | } | |
462 | |||
463 | #endif /*LV_HAVE_AVX2*/ | ||
464 | |||
465 | #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/ | ||
466 |