GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_index_max_16u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 183 191 95.8%
Functions: 6 6 100.0%
Branches: 50 62 80.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_index_max_16u
12 *
13 * \b Overview
14 *
15 * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
16 * maximum magnitude for complex points in the given vector.
17 *
18 * Note that num_points is a uint32_t, but the return value is
19 * uint16_t. Providing a vector larger than the max of a uint16_t
 20 * (65535) would miss anything outside of this boundary. The kernel
21 * will check the length of num_points and cap it to this max value,
 22 * anyway.
23 *
24 * <b>Dispatcher Prototype</b>
25 * \code
26 * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
27 * num_points) \endcode
28 *
29 * \b Inputs
30 * \li src0: The complex input vector.
31 * \li num_points: The number of samples.
32 *
33 * \b Outputs
34 * \li target: The index of the point with maximum magnitude.
35 *
36 * \b Example
37 * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
38 * the unit circle.
39 * \code
40 * int N = 10;
41 * uint32_t alignment = volk_get_alignment();
42 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
43 * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
44 *
45 * for(uint32_t ii = 0; ii < N/2; ++ii){
46 * float real = 2.f * ((float)ii / (float)N) - 1.f;
47 * float imag = std::sqrt(1.f - real * real);
48 * in[ii] = lv_cmake(real, imag);
49 * in[ii] = in[ii] * in[ii] + in[ii];
50 * in[N-ii] = lv_cmake(real, imag);
51 * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
52 * }
53 *
54 * volk_32fc_index_max_16u(max, in, N);
55 *
56 * printf("index of max value = %u\n", *max);
57 *
58 * volk_free(in);
59 * volk_free(max);
60 * \endcode
61 */
62
63 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
64 #define INCLUDED_volk_32fc_index_max_16u_a_H
65
66 #include <inttypes.h>
67 #include <limits.h>
68 #include <stdio.h>
69 #include <volk/volk_common.h>
70 #include <volk/volk_complex.h>
71
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
74 #include <volk/volk_avx2_intrinsics.h>
75
76 2 static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
77 lv_32fc_t* src0,
78 uint32_t num_points)
79 {
80 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81
82 2 const __m256i indices_increment = _mm256_set1_epi32(8);
83 /*
84 * At the start of each loop iteration current_indices holds the indices of
85 * the complex numbers loaded from memory. Explanation for odd order is given
86 * in implementation of vector_32fc_index_max_variant0().
87 */
88 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89
90 2 __m256 max_values = _mm256_setzero_ps();
91 2 __m256i max_indices = _mm256_setzero_si256();
92
93
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
94 16382 __m256 in0 = _mm256_load_ps((float*)src0);
95 16382 __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
96 16382 vector_32fc_index_max_variant0(
97 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
98 16382 src0 += 8;
99 }
100
101 // determine maximum value and index in the result of the vectorized loop
102 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
103 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
104 2 _mm256_store_ps(max_values_buffer, max_values);
105 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
106
107 2 float max = 0.f;
108 2 uint32_t index = 0;
109
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
110
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
16 if (max_values_buffer[i] > max) {
111 8 max = max_values_buffer[i];
112 8 index = max_indices_buffer[i];
113 }
114 }
115
116 // handle tail not processed by the vectorized loop
117
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118 14 const float abs_squared =
119 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
120
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
121 max = abs_squared;
122 index = i;
123 }
124 14 ++src0;
125 }
126
127 2 *target = index;
128 2 }
129
130 #endif /*LV_HAVE_AVX2*/
131
132 #ifdef LV_HAVE_AVX2
133 #include <immintrin.h>
134 #include <volk/volk_avx2_intrinsics.h>
135
136 2 static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
137 lv_32fc_t* src0,
138 uint32_t num_points)
139 {
140 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141
142 2 const __m256i indices_increment = _mm256_set1_epi32(8);
143 /*
144 * At the start of each loop iteration current_indices holds the indices of
145 * the complex numbers loaded from memory. Explanation for odd order is given
146 * in implementation of vector_32fc_index_max_variant0().
147 */
148 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149
150 2 __m256 max_values = _mm256_setzero_ps();
151 2 __m256i max_indices = _mm256_setzero_si256();
152
153
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
154 16382 __m256 in0 = _mm256_load_ps((float*)src0);
155 16382 __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
156 16382 vector_32fc_index_max_variant1(
157 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
158 16382 src0 += 8;
159 }
160
161 // determine maximum value and index in the result of the vectorized loop
162 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
163 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
164 2 _mm256_store_ps(max_values_buffer, max_values);
165 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
166
167 2 float max = 0.f;
168 2 uint32_t index = 0;
169
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
170
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
16 if (max_values_buffer[i] > max) {
171 8 max = max_values_buffer[i];
172 8 index = max_indices_buffer[i];
173 }
174 }
175
176 // handle tail not processed by the vectorized loop
177
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178 14 const float abs_squared =
179 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
180
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
181 max = abs_squared;
182 index = i;
183 }
184 14 ++src0;
185 }
186
187 2 *target = index;
188 2 }
189
190 #endif /*LV_HAVE_AVX2*/
191
192 #ifdef LV_HAVE_SSE3
193 #include <pmmintrin.h>
194 #include <xmmintrin.h>
195
196 static inline void
197 2 volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
198 {
199 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
200 2 const uint32_t num_bytes = num_points * 8;
201
202 union bit128 holderf;
203 union bit128 holderi;
204 2 float sq_dist = 0.0;
205
206 union bit128 xmm5, xmm4;
207 __m128 xmm1, xmm2, xmm3;
208 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209
210 2 xmm5.int_vec = _mm_setzero_si128();
211 2 xmm4.int_vec = _mm_setzero_si128();
212 2 holderf.int_vec = _mm_setzero_si128();
213 2 holderi.int_vec = _mm_setzero_si128();
214
215 2 int bound = num_bytes >> 5;
216 2 int i = 0;
217
218 2 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
219 2 xmm9 = _mm_setzero_si128();
220 2 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
221 2 xmm3 = _mm_setzero_ps();
222
223
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; i < bound; ++i) {
224 32766 xmm1 = _mm_load_ps((float*)src0);
225 32766 xmm2 = _mm_load_ps((float*)&src0[2]);
226
227 32766 src0 += 4;
228
229 65532 xmm1 = _mm_mul_ps(xmm1, xmm1);
230 32766 xmm2 = _mm_mul_ps(xmm2, xmm2);
231
232 32766 xmm1 = _mm_hadd_ps(xmm1, xmm2);
233
234 32766 xmm3 = _mm_max_ps(xmm1, xmm3);
235
236 32766 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
237 32766 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
238
239 32766 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
240 65532 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
241
242 32766 xmm9 = _mm_add_epi32(xmm11, xmm12);
243
244 65532 xmm8 = _mm_add_epi32(xmm8, xmm10);
245 }
246
247
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 4 & 1) {
248 2 xmm2 = _mm_load_ps((float*)src0);
249
250 2 xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
251 2 xmm8 = bit128_p(&xmm1)->int_vec;
252
253 2 xmm2 = _mm_mul_ps(xmm2, xmm2);
254
255 2 src0 += 2;
256
257 2 xmm1 = _mm_hadd_ps(xmm2, xmm2);
258
259 4 xmm3 = _mm_max_ps(xmm1, xmm3);
260
261 2 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
262
263 2 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
264 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
265
266 2 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
267 4 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
268
269 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
270
271 4 xmm8 = _mm_add_epi32(xmm8, xmm10);
272 }
273
274
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 3 & 1) {
275 2 sq_dist =
276 2 lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
277
278 2 xmm2 = _mm_load1_ps(&sq_dist);
279
280 2 xmm1 = xmm3;
281
282 2 xmm3 = _mm_max_ss(xmm3, xmm2);
283
284 2 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
285 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
286
287 2 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
288
289 2 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
290 4 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
291
292 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
293 }
294
295 _mm_store_ps((float*)&(holderf.f), xmm3);
296 _mm_store_si128(&(holderi.int_vec), xmm9);
297
298 2 target[0] = holderi.i[0];
299 2 sq_dist = holderf.f[0];
300
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
301
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
302
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
303
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
304
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
305
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
306 2 }
307
308 #endif /*LV_HAVE_SSE3*/
309
310 #ifdef LV_HAVE_GENERIC
311 static inline void
312 2 volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
313 {
314 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316 2 const uint32_t num_bytes = num_points * 8;
317
318 2 float sq_dist = 0.0;
319 2 float max = 0.0;
320 2 uint16_t index = 0;
321
322 2 uint32_t i = 0;
323
324
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; i<num_bytes>> 3; ++i) {
325 131070 sq_dist =
326 131070 lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
327
328
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 131042 times.
131070 if (sq_dist > max) {
329 28 index = i;
330 28 max = sq_dist;
331 }
332 }
333 2 target[0] = index;
334 2 }
335
336 #endif /*LV_HAVE_GENERIC*/
337
338 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
339
340 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
341 #define INCLUDED_volk_32fc_index_max_16u_u_H
342
343 #include <inttypes.h>
344 #include <limits.h>
345 #include <stdio.h>
346 #include <volk/volk_common.h>
347 #include <volk/volk_complex.h>
348
349 #ifdef LV_HAVE_AVX2
350 #include <immintrin.h>
351 #include <volk/volk_avx2_intrinsics.h>
352
353 2 static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
354 lv_32fc_t* src0,
355 uint32_t num_points)
356 {
357 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
358
359 2 const __m256i indices_increment = _mm256_set1_epi32(8);
360 /*
361 * At the start of each loop iteration current_indices holds the indices of
362 * the complex numbers loaded from memory. Explanation for odd order is given
363 * in implementation of vector_32fc_index_max_variant0().
364 */
365 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
366
367 2 __m256 max_values = _mm256_setzero_ps();
368 2 __m256i max_indices = _mm256_setzero_si256();
369
370
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
371 16382 __m256 in0 = _mm256_loadu_ps((float*)src0);
372 16382 __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
373 16382 vector_32fc_index_max_variant0(
374 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
375 16382 src0 += 8;
376 }
377
378 // determine maximum value and index in the result of the vectorized loop
379 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
380 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
381 2 _mm256_store_ps(max_values_buffer, max_values);
382 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
383
384 2 float max = 0.f;
385 2 uint32_t index = 0;
386
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
387
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
16 if (max_values_buffer[i] > max) {
388 8 max = max_values_buffer[i];
389 8 index = max_indices_buffer[i];
390 }
391 }
392
393 // handle tail not processed by the vectorized loop
394
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
395 14 const float abs_squared =
396 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
397
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
398 max = abs_squared;
399 index = i;
400 }
401 14 ++src0;
402 }
403
404 2 *target = index;
405 2 }
406
407 #endif /*LV_HAVE_AVX2*/
408
409 #ifdef LV_HAVE_AVX2
410 #include <immintrin.h>
411 #include <volk/volk_avx2_intrinsics.h>
412
413 2 static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
414 lv_32fc_t* src0,
415 uint32_t num_points)
416 {
417 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
418
419 2 const __m256i indices_increment = _mm256_set1_epi32(8);
420 /*
421 * At the start of each loop iteration current_indices holds the indices of
422 * the complex numbers loaded from memory. Explanation for odd order is given
423 * in implementation of vector_32fc_index_max_variant0().
424 */
425 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
426
427 2 __m256 max_values = _mm256_setzero_ps();
428 2 __m256i max_indices = _mm256_setzero_si256();
429
430
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
431 16382 __m256 in0 = _mm256_loadu_ps((float*)src0);
432 16382 __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
433 16382 vector_32fc_index_max_variant1(
434 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
435 16382 src0 += 8;
436 }
437
438 // determine maximum value and index in the result of the vectorized loop
439 __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
440 __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
441 2 _mm256_store_ps(max_values_buffer, max_values);
442 2 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
443
444 2 float max = 0.f;
445 2 uint32_t index = 0;
446
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
447
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
16 if (max_values_buffer[i] > max) {
448 8 max = max_values_buffer[i];
449 8 index = max_indices_buffer[i];
450 }
451 }
452
453 // handle tail not processed by the vectorized loop
454
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
455 14 const float abs_squared =
456 14 lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
457
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared > max) {
458 max = abs_squared;
459 index = i;
460 }
461 14 ++src0;
462 }
463
464 2 *target = index;
465 2 }
466
467 #endif /*LV_HAVE_AVX2*/
468
469 #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
470