GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_index_max_32u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 177 185 95.7%
Functions: 6 6 100.0%
Branches: 54 62 87.1%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_index_max_32u
12 *
13 * \b Overview
14 *
15 * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
16 * maximum magnitude for complex points in the given vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
21 * num_points) \endcode
22 *
23 * \b Inputs
24 * \li src0: The complex input vector.
25 * \li num_points: The number of samples.
26 *
27 * \b Outputs
28 * \li target: The index of the point with maximum magnitude.
29 *
30 * \b Example
31 * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
32 * the unit circle.
33 * \code
34 * int N = 10;
35 * uint32_t alignment = volk_get_alignment();
36 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37 * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
38 *
39 * for(uint32_t ii = 0; ii < N/2; ++ii){
40 * float real = 2.f * ((float)ii / (float)N) - 1.f;
41 * float imag = std::sqrt(1.f - real * real);
42 * in[ii] = lv_cmake(real, imag);
43 * in[ii] = in[ii] * in[ii] + in[ii];
44 * in[N-ii] = lv_cmake(real, imag);
45 * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
46 * }
47 *
48 * volk_32fc_index_max_32u(max, in, N);
49 *
50 * printf("index of max value = %u\n", *max);
51 *
52 * volk_free(in);
53 * volk_free(max);
54 * \endcode
55 */
56
57 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
58 #define INCLUDED_volk_32fc_index_max_32u_a_H
59
60 #include <inttypes.h>
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 #include <volk/volk_complex.h>
64
65 #ifdef LV_HAVE_AVX2
66 #include <immintrin.h>
67 #include <volk/volk_avx2_intrinsics.h>
68
69 2 static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
70 lv_32fc_t* src0,
71 uint32_t num_points)
72 {
73 2 const __m256i indices_increment = _mm256_set1_epi32(8);
74 /*
75 * At the start of each loop iteration current_indices holds the indices of
76 * the complex numbers loaded from memory. Explanation for odd order is given
77 * in implementation of vector_32fc_index_max_variant0().
78 */
79 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
80
81 2 __m256 max_values = _mm256_setzero_ps();
82 2 __m256i max_indices = _mm256_setzero_si256();
83
84
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned i = 0; i < num_points / 8u; ++i) {
85 32766 __m256 in0 = _mm256_load_ps((float*)src0);
86 32766 __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
87 32766 vector_32fc_index_max_variant0(
88 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
89 32766 src0 += 8;
90 }
91
92 // determine maximum value and index in the result of the vectorized loop
93 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
94 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
95 2 _mm256_store_ps(max_values_buffer, max_values);
96 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
97
98 2 float max = 0.f;
99 2 uint32_t index = 0;
100
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
101
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (max_values_buffer[i] > max) {
102 4 max = max_values_buffer[i];
103 4 index = max_indices_buffer[i];
104 }
105 }
106
107 // handle tail not processed by the vectorized loop
108
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
109 14 const float abs_squared =
110 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
111
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
112 max = abs_squared;
113 index = i;
114 }
115 14 ++src0;
116 }
117
118 2 *target = index;
119 2 }
120
121 #endif /*LV_HAVE_AVX2*/
122
123 #ifdef LV_HAVE_AVX2
124 #include <immintrin.h>
125 #include <volk/volk_avx2_intrinsics.h>
126
127 2 static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
128 lv_32fc_t* src0,
129 uint32_t num_points)
130 {
131 2 const __m256i indices_increment = _mm256_set1_epi32(8);
132 /*
133 * At the start of each loop iteration current_indices holds the indices of
134 * the complex numbers loaded from memory. Explanation for odd order is given
135 * in implementation of vector_32fc_index_max_variant0().
136 */
137 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
138
139 2 __m256 max_values = _mm256_setzero_ps();
140 2 __m256i max_indices = _mm256_setzero_si256();
141
142
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned i = 0; i < num_points / 8u; ++i) {
143 32766 __m256 in0 = _mm256_load_ps((float*)src0);
144 32766 __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
145 32766 vector_32fc_index_max_variant1(
146 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
147 32766 src0 += 8;
148 }
149
150 // determine maximum value and index in the result of the vectorized loop
151 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
152 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
153 2 _mm256_store_ps(max_values_buffer, max_values);
154 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
155
156 2 float max = 0.f;
157 2 uint32_t index = 0;
158
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
159
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (max_values_buffer[i] > max) {
160 4 max = max_values_buffer[i];
161 4 index = max_indices_buffer[i];
162 }
163 }
164
165 // handle tail not processed by the vectorized loop
166
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
167 14 const float abs_squared =
168 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
169
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
170 max = abs_squared;
171 index = i;
172 }
173 14 ++src0;
174 }
175
176 2 *target = index;
177 2 }
178
179 #endif /*LV_HAVE_AVX2*/
180
181 #ifdef LV_HAVE_SSE3
182 #include <pmmintrin.h>
183 #include <xmmintrin.h>
184
185 static inline void
186 2 volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
187 {
188 2 const uint32_t num_bytes = num_points * 8;
189
190 union bit128 holderf;
191 union bit128 holderi;
192 2 float sq_dist = 0.0;
193
194 union bit128 xmm5, xmm4;
195 __m128 xmm1, xmm2, xmm3;
196 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
197
198 2 xmm5.int_vec = _mm_setzero_si128();
199 2 xmm4.int_vec = _mm_setzero_si128();
200 2 holderf.int_vec = _mm_setzero_si128();
201 2 holderi.int_vec = _mm_setzero_si128();
202
203 2 int bound = num_bytes >> 5;
204 2 int i = 0;
205
206 2 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
207 2 xmm9 = _mm_setzero_si128();
208 2 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
209 2 xmm3 = _mm_setzero_ps();
210
211
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; i < bound; ++i) {
212 65534 xmm1 = _mm_load_ps((float*)src0);
213 65534 xmm2 = _mm_load_ps((float*)&src0[2]);
214
215 65534 src0 += 4;
216
217 131068 xmm1 = _mm_mul_ps(xmm1, xmm1);
218 65534 xmm2 = _mm_mul_ps(xmm2, xmm2);
219
220 65534 xmm1 = _mm_hadd_ps(xmm1, xmm2);
221
222 65534 xmm3 = _mm_max_ps(xmm1, xmm3);
223
224 65534 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
225 65534 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
226
227 65534 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
228 131068 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
229
230 65534 xmm9 = _mm_add_epi32(xmm11, xmm12);
231
232 131068 xmm8 = _mm_add_epi32(xmm8, xmm10);
233 }
234
235
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 4 & 1) {
236 2 xmm2 = _mm_load_ps((float*)src0);
237
238 2 xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
239 2 xmm8 = bit128_p(&xmm1)->int_vec;
240
241 2 xmm2 = _mm_mul_ps(xmm2, xmm2);
242
243 2 src0 += 2;
244
245 2 xmm1 = _mm_hadd_ps(xmm2, xmm2);
246
247 4 xmm3 = _mm_max_ps(xmm1, xmm3);
248
249 2 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
250
251 2 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
252 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
253
254 2 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
255 4 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
256
257 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
258
259 4 xmm8 = _mm_add_epi32(xmm8, xmm10);
260 }
261
262
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 3 & 1) {
263 2 sq_dist =
264 2 lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
265
266 2 xmm2 = _mm_load1_ps(&sq_dist);
267
268 2 xmm1 = xmm3;
269
270 2 xmm3 = _mm_max_ss(xmm3, xmm2);
271
272 2 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
273 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
274
275 2 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
276
277 2 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
278 4 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
279
280 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
281 }
282
283 _mm_store_ps((float*)&(holderf.f), xmm3);
284 _mm_store_si128(&(holderi.int_vec), xmm9);
285
286 2 target[0] = holderi.i[0];
287 2 sq_dist = holderf.f[0];
288
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
289
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
290
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
291
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
292
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
293
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
294 2 }
295
296 #endif /*LV_HAVE_SSE3*/
297
298 #ifdef LV_HAVE_GENERIC
299 static inline void
300 2 volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
301 {
302 2 const uint32_t num_bytes = num_points * 8;
303
304 2 float sq_dist = 0.0;
305 2 float max = 0.0;
306 2 uint32_t index = 0;
307
308 2 uint32_t i = 0;
309
310
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (; i<num_bytes>> 3; ++i) {
311 262142 sq_dist =
312 262142 lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
313
314
2/2
✓ Branch 0 taken 23 times.
✓ Branch 1 taken 262119 times.
262142 if (sq_dist > max) {
315 23 index = i;
316 23 max = sq_dist;
317 }
318 }
319 2 target[0] = index;
320 2 }
321
322 #endif /*LV_HAVE_GENERIC*/
323
324 #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
325
326 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
327 #define INCLUDED_volk_32fc_index_max_32u_u_H
328
329 #include <inttypes.h>
330 #include <stdio.h>
331 #include <volk/volk_common.h>
332 #include <volk/volk_complex.h>
333
334 #ifdef LV_HAVE_AVX2
335 #include <immintrin.h>
336 #include <volk/volk_avx2_intrinsics.h>
337
338 2 static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
339 lv_32fc_t* src0,
340 uint32_t num_points)
341 {
342 2 const __m256i indices_increment = _mm256_set1_epi32(8);
343 /*
344 * At the start of each loop iteration current_indices holds the indices of
345 * the complex numbers loaded from memory. Explanation for odd order is given
346 * in implementation of vector_32fc_index_max_variant0().
347 */
348 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
349
350 2 __m256 max_values = _mm256_setzero_ps();
351 2 __m256i max_indices = _mm256_setzero_si256();
352
353
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned i = 0; i < num_points / 8u; ++i) {
354 32766 __m256 in0 = _mm256_loadu_ps((float*)src0);
355 32766 __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
356 32766 vector_32fc_index_max_variant0(
357 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
358 32766 src0 += 8;
359 }
360
361 // determine maximum value and index in the result of the vectorized loop
362 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
363 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
364 2 _mm256_store_ps(max_values_buffer, max_values);
365 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
366
367 2 float max = 0.f;
368 2 uint32_t index = 0;
369
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
370
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (max_values_buffer[i] > max) {
371 4 max = max_values_buffer[i];
372 4 index = max_indices_buffer[i];
373 }
374 }
375
376 // handle tail not processed by the vectorized loop
377
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
378 14 const float abs_squared =
379 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
380
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
381 max = abs_squared;
382 index = i;
383 }
384 14 ++src0;
385 }
386
387 2 *target = index;
388 2 }
389
390 #endif /*LV_HAVE_AVX2*/
391
392 #ifdef LV_HAVE_AVX2
393 #include <immintrin.h>
394 #include <volk/volk_avx2_intrinsics.h>
395
396 2 static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
397 lv_32fc_t* src0,
398 uint32_t num_points)
399 {
400 2 const __m256i indices_increment = _mm256_set1_epi32(8);
401 /*
402 * At the start of each loop iteration current_indices holds the indices of
403 * the complex numbers loaded from memory. Explanation for odd order is given
404 * in implementation of vector_32fc_index_max_variant0().
405 */
406 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
407
408 2 __m256 max_values = _mm256_setzero_ps();
409 2 __m256i max_indices = _mm256_setzero_si256();
410
411
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned i = 0; i < num_points / 8u; ++i) {
412 32766 __m256 in0 = _mm256_loadu_ps((float*)src0);
413 32766 __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
414 32766 vector_32fc_index_max_variant1(
415 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
416 32766 src0 += 8;
417 }
418
419 // determine maximum value and index in the result of the vectorized loop
420 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
421 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
422 2 _mm256_store_ps(max_values_buffer, max_values);
423 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
424
425 2 float max = 0.f;
426 2 uint32_t index = 0;
427
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
428
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (max_values_buffer[i] > max) {
429 4 max = max_values_buffer[i];
430 4 index = max_indices_buffer[i];
431 }
432 }
433
434 // handle tail not processed by the vectorized loop
435
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
436 14 const float abs_squared =
437 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
438
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
439 max = abs_squared;
440 index = i;
441 }
442 14 ++src0;
443 }
444
445 2 *target = index;
446 2 }
447
448 #endif /*LV_HAVE_AVX2*/
449
450 #ifdef LV_HAVE_NEON
451 #include <arm_neon.h>
452 #include <volk/volk_neon_intrinsics.h>
453
454 static inline void
455 volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
456 {
457 unsigned int number = 0;
458 const uint32_t quarter_points = num_points / 4;
459 const lv_32fc_t* src0Ptr = src0;
460
461 uint32_t indices[4] = { 0, 1, 2, 3 };
462 const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
463 uint32x4_t vec_indices = vld1q_u32(indices);
464 uint32x4_t vec_max_indices = vec_indices;
465
466 if (num_points) {
467 float max = FLT_MIN;
468 uint32_t index = 0;
469
470 float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
471
472 for (; number < quarter_points; number++) {
473 // Load complex and compute magnitude squared
474 const float32x4_t vec_mag2 =
475 _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
476 __VOLK_PREFETCH(src0Ptr += 4);
477 // a > b?
478 const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
479 vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
480 vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
481 vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
482 }
483 uint32_t tmp_max_indices[4];
484 float tmp_max[4];
485 vst1q_u32(tmp_max_indices, vec_max_indices);
486 vst1q_f32(tmp_max, vec_max);
487
488 for (int i = 0; i < 4; i++) {
489 if (tmp_max[i] > max) {
490 max = tmp_max[i];
491 index = tmp_max_indices[i];
492 }
493 }
494
495 // Deal with the rest
496 for (number = quarter_points * 4; number < num_points; number++) {
497 const float re = lv_creal(*src0Ptr);
498 const float im = lv_cimag(*src0Ptr);
499 const float sq_dist = re * re + im * im;
500 if (sq_dist > max) {
501 max = sq_dist;
502 index = number;
503 }
504 src0Ptr++;
505 }
506 *target = index;
507 }
508 }
509
510 #endif /*LV_HAVE_NEON*/
511
512 #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
513