Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_index_max_16u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the | ||
16 | * maximum magnitude for complex points in the given vector. | ||
17 | * | ||
18 | * Note that num_points is a uint32_t, but the return value is | ||
19 | * uint16_t. Providing a vector larger than the max of a uint16_t | ||
20 | * (65536) would miss anything outside of this boundary. The kernel | ||
21 | * will check the length of num_points and cap it to this max value, | ||
22 | * anyways. | ||
23 | * | ||
24 | * <b>Dispatcher Prototype</b> | ||
25 | * \code | ||
26 | * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t | ||
27 | * num_points) \endcode | ||
28 | * | ||
29 | * \b Inputs | ||
30 | * \li src0: The complex input vector. | ||
31 | * \li num_points: The number of samples. | ||
32 | * | ||
33 | * \b Outputs | ||
34 | * \li target: The index of the point with maximum magnitude. | ||
35 | * | ||
36 | * \b Example | ||
37 | * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around | ||
38 | * the unit circle. | ||
39 | * \code | ||
40 | * int N = 10; | ||
41 | * uint32_t alignment = volk_get_alignment(); | ||
42 | * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
43 | * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment); | ||
44 | * | ||
45 | * for(uint32_t ii = 0; ii < N/2; ++ii){ | ||
46 | * float real = 2.f * ((float)ii / (float)N) - 1.f; | ||
47 | * float imag = std::sqrt(1.f - real * real); | ||
48 | * in[ii] = lv_cmake(real, imag); | ||
49 | * in[ii] = in[ii] * in[ii] + in[ii]; | ||
50 | * in[N-ii] = lv_cmake(real, imag); | ||
51 | * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii]; | ||
52 | * } | ||
53 | * | ||
54 | * volk_32fc_index_max_16u(max, in, N); | ||
55 | * | ||
56 | * printf("index of max value = %u\n", *max); | ||
57 | * | ||
58 | * volk_free(in); | ||
59 | * volk_free(max); | ||
60 | * \endcode | ||
61 | */ | ||
62 | |||
63 | #ifndef INCLUDED_volk_32fc_index_max_16u_a_H | ||
64 | #define INCLUDED_volk_32fc_index_max_16u_a_H | ||
65 | |||
66 | #include <inttypes.h> | ||
67 | #include <limits.h> | ||
68 | #include <stdio.h> | ||
69 | #include <volk/volk_common.h> | ||
70 | #include <volk/volk_complex.h> | ||
71 | |||
72 | #ifdef LV_HAVE_AVX2 | ||
73 | #include <immintrin.h> | ||
74 | #include <volk/volk_avx2_intrinsics.h> | ||
75 | |||
76 | 2 | static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target, | |
77 | lv_32fc_t* src0, | ||
78 | uint32_t num_points) | ||
79 | { | ||
80 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
81 | |||
82 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
83 | /* | ||
84 | * At the start of each loop iteration current_indices holds the indices of | ||
85 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
86 | * in implementation of vector_32fc_index_max_variant0(). | ||
87 | */ | ||
88 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
89 | |||
90 | 2 | __m256 max_values = _mm256_setzero_ps(); | |
91 | 2 | __m256i max_indices = _mm256_setzero_si256(); | |
92 | |||
93 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
94 | 16382 | __m256 in0 = _mm256_load_ps((float*)src0); | |
95 | 16382 | __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); | |
96 | 16382 | vector_32fc_index_max_variant0( | |
97 | in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); | ||
98 | 16382 | src0 += 8; | |
99 | } | ||
100 | |||
101 | // determine maximum value and index in the result of the vectorized loop | ||
102 | __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; | ||
103 | __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; | ||
104 | 2 | _mm256_store_ps(max_values_buffer, max_values); | |
105 | 2 | _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); | |
106 | |||
107 | 2 | float max = 0.f; | |
108 | 2 | uint32_t index = 0; | |
109 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
110 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (max_values_buffer[i] > max) { |
111 | 8 | max = max_values_buffer[i]; | |
112 | 8 | index = max_indices_buffer[i]; | |
113 | } | ||
114 | } | ||
115 | |||
116 | // handle tail not processed by the vectorized loop | ||
117 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
118 | 14 | const float abs_squared = | |
119 | 14 | lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); | |
120 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared > max) { |
121 | ✗ | max = abs_squared; | |
122 | ✗ | index = i; | |
123 | } | ||
124 | 14 | ++src0; | |
125 | } | ||
126 | |||
127 | 2 | *target = index; | |
128 | 2 | } | |
129 | |||
130 | #endif /*LV_HAVE_AVX2*/ | ||
131 | |||
132 | #ifdef LV_HAVE_AVX2 | ||
133 | #include <immintrin.h> | ||
134 | #include <volk/volk_avx2_intrinsics.h> | ||
135 | |||
136 | 2 | static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, | |
137 | lv_32fc_t* src0, | ||
138 | uint32_t num_points) | ||
139 | { | ||
140 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
141 | |||
142 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
143 | /* | ||
144 | * At the start of each loop iteration current_indices holds the indices of | ||
145 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
146 | * in implementation of vector_32fc_index_max_variant0(). | ||
147 | */ | ||
148 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
149 | |||
150 | 2 | __m256 max_values = _mm256_setzero_ps(); | |
151 | 2 | __m256i max_indices = _mm256_setzero_si256(); | |
152 | |||
153 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
154 | 16382 | __m256 in0 = _mm256_load_ps((float*)src0); | |
155 | 16382 | __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); | |
156 | 16382 | vector_32fc_index_max_variant1( | |
157 | in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); | ||
158 | 16382 | src0 += 8; | |
159 | } | ||
160 | |||
161 | // determine maximum value and index in the result of the vectorized loop | ||
162 | __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; | ||
163 | __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; | ||
164 | 2 | _mm256_store_ps(max_values_buffer, max_values); | |
165 | 2 | _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); | |
166 | |||
167 | 2 | float max = 0.f; | |
168 | 2 | uint32_t index = 0; | |
169 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
170 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (max_values_buffer[i] > max) { |
171 | 8 | max = max_values_buffer[i]; | |
172 | 8 | index = max_indices_buffer[i]; | |
173 | } | ||
174 | } | ||
175 | |||
176 | // handle tail not processed by the vectorized loop | ||
177 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
178 | 14 | const float abs_squared = | |
179 | 14 | lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); | |
180 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared > max) { |
181 | ✗ | max = abs_squared; | |
182 | ✗ | index = i; | |
183 | } | ||
184 | 14 | ++src0; | |
185 | } | ||
186 | |||
187 | 2 | *target = index; | |
188 | 2 | } | |
189 | |||
190 | #endif /*LV_HAVE_AVX2*/ | ||
191 | |||
192 | #ifdef LV_HAVE_SSE3 | ||
193 | #include <pmmintrin.h> | ||
194 | #include <xmmintrin.h> | ||
195 | |||
196 | static inline void | ||
197 | 2 | volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) | |
198 | { | ||
199 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
200 | 2 | const uint32_t num_bytes = num_points * 8; | |
201 | |||
202 | union bit128 holderf; | ||
203 | union bit128 holderi; | ||
204 | 2 | float sq_dist = 0.0; | |
205 | |||
206 | union bit128 xmm5, xmm4; | ||
207 | __m128 xmm1, xmm2, xmm3; | ||
208 | __m128i xmm8, xmm11, xmm12, xmm9, xmm10; | ||
209 | |||
210 | 2 | xmm5.int_vec = _mm_setzero_si128(); | |
211 | 2 | xmm4.int_vec = _mm_setzero_si128(); | |
212 | 2 | holderf.int_vec = _mm_setzero_si128(); | |
213 | 2 | holderi.int_vec = _mm_setzero_si128(); | |
214 | |||
215 | 2 | int bound = num_bytes >> 5; | |
216 | 2 | int i = 0; | |
217 | |||
218 | 2 | xmm8 = _mm_setr_epi32(0, 1, 2, 3); | |
219 | 2 | xmm9 = _mm_setzero_si128(); | |
220 | 2 | xmm10 = _mm_setr_epi32(4, 4, 4, 4); | |
221 | 2 | xmm3 = _mm_setzero_ps(); | |
222 | |||
223 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; i < bound; ++i) { |
224 | 32766 | xmm1 = _mm_load_ps((float*)src0); | |
225 | 32766 | xmm2 = _mm_load_ps((float*)&src0[2]); | |
226 | |||
227 | 32766 | src0 += 4; | |
228 | |||
229 | 65532 | xmm1 = _mm_mul_ps(xmm1, xmm1); | |
230 | 32766 | xmm2 = _mm_mul_ps(xmm2, xmm2); | |
231 | |||
232 | 32766 | xmm1 = _mm_hadd_ps(xmm1, xmm2); | |
233 | |||
234 | 32766 | xmm3 = _mm_max_ps(xmm1, xmm3); | |
235 | |||
236 | 32766 | xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); | |
237 | 32766 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
238 | |||
239 | 32766 | xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); | |
240 | 65532 | xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); | |
241 | |||
242 | 32766 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
243 | |||
244 | 65532 | xmm8 = _mm_add_epi32(xmm8, xmm10); | |
245 | } | ||
246 | |||
247 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 4 & 1) { |
248 | 2 | xmm2 = _mm_load_ps((float*)src0); | |
249 | |||
250 | 2 | xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); | |
251 | 2 | xmm8 = bit128_p(&xmm1)->int_vec; | |
252 | |||
253 | 2 | xmm2 = _mm_mul_ps(xmm2, xmm2); | |
254 | |||
255 | 2 | src0 += 2; | |
256 | |||
257 | 2 | xmm1 = _mm_hadd_ps(xmm2, xmm2); | |
258 | |||
259 | 4 | xmm3 = _mm_max_ps(xmm1, xmm3); | |
260 | |||
261 | 2 | xmm10 = _mm_setr_epi32(2, 2, 2, 2); | |
262 | |||
263 | 2 | xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); | |
264 | 2 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
265 | |||
266 | 2 | xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); | |
267 | 4 | xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); | |
268 | |||
269 | 2 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
270 | |||
271 | 4 | xmm8 = _mm_add_epi32(xmm8, xmm10); | |
272 | } | ||
273 | |||
274 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 3 & 1) { |
275 | 2 | sq_dist = | |
276 | 2 | lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); | |
277 | |||
278 | 2 | xmm2 = _mm_load1_ps(&sq_dist); | |
279 | |||
280 | 2 | xmm1 = xmm3; | |
281 | |||
282 | 2 | xmm3 = _mm_max_ss(xmm3, xmm2); | |
283 | |||
284 | 2 | xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); | |
285 | 2 | xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); | |
286 | |||
287 | 2 | xmm8 = _mm_shuffle_epi32(xmm8, 0x00); | |
288 | |||
289 | 2 | xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); | |
290 | 4 | xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); | |
291 | |||
292 | 2 | xmm9 = _mm_add_epi32(xmm11, xmm12); | |
293 | } | ||
294 | |||
295 | _mm_store_ps((float*)&(holderf.f), xmm3); | ||
296 | _mm_store_si128(&(holderi.int_vec), xmm9); | ||
297 | |||
298 | 2 | target[0] = holderi.i[0]; | |
299 | 2 | sq_dist = holderf.f[0]; | |
300 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; |
301 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; |
302 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; |
303 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; |
304 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; |
305 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; |
306 | 2 | } | |
307 | |||
308 | #endif /*LV_HAVE_SSE3*/ | ||
309 | |||
310 | #ifdef LV_HAVE_GENERIC | ||
311 | static inline void | ||
312 | 2 | volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) | |
313 | { | ||
314 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
315 | |||
316 | 2 | const uint32_t num_bytes = num_points * 8; | |
317 | |||
318 | 2 | float sq_dist = 0.0; | |
319 | 2 | float max = 0.0; | |
320 | 2 | uint16_t index = 0; | |
321 | |||
322 | 2 | uint32_t i = 0; | |
323 | |||
324 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; i<num_bytes>> 3; ++i) { |
325 | 131070 | sq_dist = | |
326 | 131070 | lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); | |
327 | |||
328 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 131042 times.
|
131070 | if (sq_dist > max) { |
329 | 28 | index = i; | |
330 | 28 | max = sq_dist; | |
331 | } | ||
332 | } | ||
333 | 2 | target[0] = index; | |
334 | 2 | } | |
335 | |||
336 | #endif /*LV_HAVE_GENERIC*/ | ||
337 | |||
338 | #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/ | ||
339 | |||
340 | #ifndef INCLUDED_volk_32fc_index_max_16u_u_H | ||
341 | #define INCLUDED_volk_32fc_index_max_16u_u_H | ||
342 | |||
343 | #include <inttypes.h> | ||
344 | #include <limits.h> | ||
345 | #include <stdio.h> | ||
346 | #include <volk/volk_common.h> | ||
347 | #include <volk/volk_complex.h> | ||
348 | |||
349 | #ifdef LV_HAVE_AVX2 | ||
350 | #include <immintrin.h> | ||
351 | #include <volk/volk_avx2_intrinsics.h> | ||
352 | |||
353 | 2 | static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target, | |
354 | lv_32fc_t* src0, | ||
355 | uint32_t num_points) | ||
356 | { | ||
357 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
358 | |||
359 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
360 | /* | ||
361 | * At the start of each loop iteration current_indices holds the indices of | ||
362 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
363 | * in implementation of vector_32fc_index_max_variant0(). | ||
364 | */ | ||
365 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
366 | |||
367 | 2 | __m256 max_values = _mm256_setzero_ps(); | |
368 | 2 | __m256i max_indices = _mm256_setzero_si256(); | |
369 | |||
370 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
371 | 16382 | __m256 in0 = _mm256_loadu_ps((float*)src0); | |
372 | 16382 | __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); | |
373 | 16382 | vector_32fc_index_max_variant0( | |
374 | in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); | ||
375 | 16382 | src0 += 8; | |
376 | } | ||
377 | |||
378 | // determine maximum value and index in the result of the vectorized loop | ||
379 | __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; | ||
380 | __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; | ||
381 | 2 | _mm256_store_ps(max_values_buffer, max_values); | |
382 | 2 | _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); | |
383 | |||
384 | 2 | float max = 0.f; | |
385 | 2 | uint32_t index = 0; | |
386 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
387 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (max_values_buffer[i] > max) { |
388 | 8 | max = max_values_buffer[i]; | |
389 | 8 | index = max_indices_buffer[i]; | |
390 | } | ||
391 | } | ||
392 | |||
393 | // handle tail not processed by the vectorized loop | ||
394 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
395 | 14 | const float abs_squared = | |
396 | 14 | lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); | |
397 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared > max) { |
398 | ✗ | max = abs_squared; | |
399 | ✗ | index = i; | |
400 | } | ||
401 | 14 | ++src0; | |
402 | } | ||
403 | |||
404 | 2 | *target = index; | |
405 | 2 | } | |
406 | |||
407 | #endif /*LV_HAVE_AVX2*/ | ||
408 | |||
409 | #ifdef LV_HAVE_AVX2 | ||
410 | #include <immintrin.h> | ||
411 | #include <volk/volk_avx2_intrinsics.h> | ||
412 | |||
413 | 2 | static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, | |
414 | lv_32fc_t* src0, | ||
415 | uint32_t num_points) | ||
416 | { | ||
417 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
418 | |||
419 | 2 | const __m256i indices_increment = _mm256_set1_epi32(8); | |
420 | /* | ||
421 | * At the start of each loop iteration current_indices holds the indices of | ||
422 | * the complex numbers loaded from memory. Explanation for odd order is given | ||
423 | * in implementation of vector_32fc_index_max_variant0(). | ||
424 | */ | ||
425 | 2 | __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
426 | |||
427 | 2 | __m256 max_values = _mm256_setzero_ps(); | |
428 | 2 | __m256i max_indices = _mm256_setzero_si256(); | |
429 | |||
430 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (unsigned i = 0; i < num_points / 8u; ++i) { |
431 | 16382 | __m256 in0 = _mm256_loadu_ps((float*)src0); | |
432 | 16382 | __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); | |
433 | 16382 | vector_32fc_index_max_variant1( | |
434 | in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); | ||
435 | 16382 | src0 += 8; | |
436 | } | ||
437 | |||
438 | // determine maximum value and index in the result of the vectorized loop | ||
439 | __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; | ||
440 | __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; | ||
441 | 2 | _mm256_store_ps(max_values_buffer, max_values); | |
442 | 2 | _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); | |
443 | |||
444 | 2 | float max = 0.f; | |
445 | 2 | uint32_t index = 0; | |
446 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (unsigned i = 0; i < 8; i++) { |
447 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (max_values_buffer[i] > max) { |
448 | 8 | max = max_values_buffer[i]; | |
449 | 8 | index = max_indices_buffer[i]; | |
450 | } | ||
451 | } | ||
452 | |||
453 | // handle tail not processed by the vectorized loop | ||
454 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (unsigned i = num_points & (~7u); i < num_points; ++i) { |
455 | 14 | const float abs_squared = | |
456 | 14 | lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); | |
457 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (abs_squared > max) { |
458 | ✗ | max = abs_squared; | |
459 | ✗ | index = i; | |
460 | } | ||
461 | 14 | ++src0; | |
462 | } | ||
463 | |||
464 | 2 | *target = index; | |
465 | 2 | } | |
466 | |||
467 | #endif /*LV_HAVE_AVX2*/ | ||
468 | |||
469 | #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/ | ||
470 |