GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_index_min_16u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 179 187 95.7%
Functions: 6 6 100.0%
Branches: 54 62 87.1%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_index_min_16u
12 *
13 * \b Overview
14 *
15 * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
16 * minimum magnitude for complex points in the given vector.
17 *
18 * Note that num_points is a uint32_t, but the return value is
19 * uint16_t. Providing a vector larger than the max of a uint16_t
20 * (65536) would miss anything outside of this boundary. The kernel
21 * will check the length of num_points and cap it to this max value,
22 * anyways.
23 *
24 * <b>Dispatcher Prototype</b>
25 * \code
26 * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t
27 * num_points) \endcode
28 *
29 * \b Inputs
30 * \li source: The complex input vector.
31 * \li num_points: The number of samples.
32 *
33 * \b Outputs
34 * \li target: The index of the point with minimum magnitude.
35 *
36 * \b Example
37 * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
38 * the unit circle.
39 * \code
40 * int N = 10;
41 * uint32_t alignment = volk_get_alignment();
42 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
43 * uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
44 *
45 * for(uint32_t ii = 0; ii < N/2; ++ii){
46 * float real = 2.f * ((float)ii / (float)N) - 1.f;
47 * float imag = std::sqrt(1.f - real * real);
48 * in[ii] = lv_cmake(real, imag);
49 * in[ii] = in[ii] * in[ii] + in[ii];
50 * in[N-ii] = lv_cmake(real, imag);
51 * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
52 * }
53 *
54 * volk_32fc_index_min_16u(min, in, N);
55 *
56 * printf("index of min value = %u\n", *min);
57 *
58 * volk_free(in);
59 * volk_free(min);
60 * \endcode
61 */
62
63 #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
64 #define INCLUDED_volk_32fc_index_min_16u_a_H
65
66 #include <inttypes.h>
67 #include <limits.h>
68 #include <stdio.h>
69 #include <volk/volk_common.h>
70 #include <volk/volk_complex.h>
71
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
74 #include <volk/volk_avx2_intrinsics.h>
75
76 2 static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
77 const lv_32fc_t* source,
78 uint32_t num_points)
79 {
80 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81
82 2 const __m256i indices_increment = _mm256_set1_epi32(8);
83 /*
84 * At the start of each loop iteration current_indices holds the indices of
85 * the complex numbers loaded from memory. Explanation for odd order is given
86 * in implementation of vector_32fc_index_min_variant0().
87 */
88 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89
90 2 __m256 min_values = _mm256_set1_ps(FLT_MAX);
91 2 __m256i min_indices = _mm256_setzero_si256();
92
93
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
94 16382 __m256 in0 = _mm256_load_ps((float*)source);
95 16382 __m256 in1 = _mm256_load_ps((float*)(source + 4));
96 16382 vector_32fc_index_min_variant0(
97 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
98 16382 source += 8;
99 }
100
101 // determine minimum value and index in the result of the vectorized loop
102 __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
103 __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
104 2 _mm256_store_ps(min_values_buffer, min_values);
105 2 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
106
107 2 float min = FLT_MAX;
108 2 uint32_t index = 0;
109
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
110
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (min_values_buffer[i] < min) {
111 4 min = min_values_buffer[i];
112 4 index = min_indices_buffer[i];
113 }
114 }
115
116 // handle tail not processed by the vectorized loop
117
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118 14 const float abs_squared =
119 14 lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
120
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared < min) {
121 min = abs_squared;
122 index = i;
123 }
124 14 ++source;
125 }
126
127 2 *target = index;
128 2 }
129
130 #endif /*LV_HAVE_AVX2*/
131
132 #ifdef LV_HAVE_AVX2
133 #include <immintrin.h>
134 #include <volk/volk_avx2_intrinsics.h>
135
136 2 static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
137 const lv_32fc_t* source,
138 uint32_t num_points)
139 {
140 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141
142 2 const __m256i indices_increment = _mm256_set1_epi32(8);
143 /*
144 * At the start of each loop iteration current_indices holds the indices of
145 * the complex numbers loaded from memory. Explanation for odd order is given
146 * in implementation of vector_32fc_index_min_variant0().
147 */
148 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149
150 2 __m256 min_values = _mm256_set1_ps(FLT_MAX);
151 2 __m256i min_indices = _mm256_setzero_si256();
152
153
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
154 16382 __m256 in0 = _mm256_load_ps((float*)source);
155 16382 __m256 in1 = _mm256_load_ps((float*)(source + 4));
156 16382 vector_32fc_index_min_variant1(
157 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
158 16382 source += 8;
159 }
160
161 // determine minimum value and index in the result of the vectorized loop
162 __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
163 __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
164 2 _mm256_store_ps(min_values_buffer, min_values);
165 2 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
166
167 2 float min = FLT_MAX;
168 2 uint32_t index = 0;
169
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
170
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (min_values_buffer[i] < min) {
171 4 min = min_values_buffer[i];
172 4 index = min_indices_buffer[i];
173 }
174 }
175
176 // handle tail not processed by the vectorized loop
177
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178 14 const float abs_squared =
179 14 lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
180
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared < min) {
181 min = abs_squared;
182 index = i;
183 }
184 14 ++source;
185 }
186
187 2 *target = index;
188 2 }
189
190 #endif /*LV_HAVE_AVX2*/
191
192 #ifdef LV_HAVE_SSE3
193 #include <pmmintrin.h>
194 #include <xmmintrin.h>
195
196 2 static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
197 const lv_32fc_t* source,
198 uint32_t num_points)
199 {
200 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
201
202 union bit128 holderf;
203 union bit128 holderi;
204 2 float sq_dist = 0.0;
205
206 union bit128 xmm5, xmm4;
207 __m128 xmm1, xmm2, xmm3;
208 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209
210 2 xmm5.int_vec = _mm_setzero_si128();
211 2 xmm4.int_vec = _mm_setzero_si128();
212 2 holderf.int_vec = _mm_setzero_si128();
213 2 holderi.int_vec = _mm_setzero_si128();
214
215 2 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
216 2 xmm9 = _mm_setzero_si128();
217 2 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
218 2 xmm3 = _mm_set_ps1(FLT_MAX);
219
220 2 int bound = num_points >> 2;
221
222
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (int i = 0; i < bound; ++i) {
223 32766 xmm1 = _mm_load_ps((float*)source);
224 32766 xmm2 = _mm_load_ps((float*)&source[2]);
225
226 32766 source += 4;
227
228 65532 xmm1 = _mm_mul_ps(xmm1, xmm1);
229 32766 xmm2 = _mm_mul_ps(xmm2, xmm2);
230
231 32766 xmm1 = _mm_hadd_ps(xmm1, xmm2);
232
233 32766 xmm3 = _mm_min_ps(xmm1, xmm3);
234
235 32766 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
236 32766 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
237
238 32766 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
239 65532 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
240
241 32766 xmm9 = _mm_add_epi32(xmm11, xmm12);
242
243 65532 xmm8 = _mm_add_epi32(xmm8, xmm10);
244 }
245
246
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points >> 1 & 1) {
247 2 xmm2 = _mm_load_ps((float*)source);
248
249 2 xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
250 2 xmm8 = bit128_p(&xmm1)->int_vec;
251
252 2 xmm2 = _mm_mul_ps(xmm2, xmm2);
253
254 2 source += 2;
255
256 2 xmm1 = _mm_hadd_ps(xmm2, xmm2);
257
258 4 xmm3 = _mm_min_ps(xmm1, xmm3);
259
260 2 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
261
262 2 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
263 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
264
265 2 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
266 4 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
267
268 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
269
270 4 xmm8 = _mm_add_epi32(xmm8, xmm10);
271 }
272
273
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points & 1) {
274 2 sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
275 2 lv_cimag(source[0]) * lv_cimag(source[0]);
276
277 2 xmm2 = _mm_load1_ps(&sq_dist);
278
279 2 xmm1 = xmm3;
280
281 2 xmm3 = _mm_min_ss(xmm3, xmm2);
282
283 2 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
284 2 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
285
286 2 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
287
288 2 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
289 4 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
290
291 2 xmm9 = _mm_add_epi32(xmm11, xmm12);
292 }
293
294 _mm_store_ps((float*)&(holderf.f), xmm3);
295 _mm_store_si128(&(holderi.int_vec), xmm9);
296
297 2 target[0] = holderi.i[0];
298 2 sq_dist = holderf.f[0];
299
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
300
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
301
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
302
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
303
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
304
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
305 2 }
306
307 #endif /*LV_HAVE_SSE3*/
308
309 #ifdef LV_HAVE_GENERIC
310 2 static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
311 const lv_32fc_t* source,
312 uint32_t num_points)
313 {
314 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316 2 float sq_dist = 0.0;
317 2 float min = FLT_MAX;
318 2 uint16_t index = 0;
319
320
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (uint32_t i = 0; i < num_points; ++i) {
321 131070 sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
322 131070 lv_cimag(source[i]) * lv_cimag(source[i]);
323
324
2/2
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 131052 times.
131070 if (sq_dist < min) {
325 18 index = i;
326 18 min = sq_dist;
327 }
328 }
329 2 target[0] = index;
330 2 }
331
332 #endif /*LV_HAVE_GENERIC*/
333
334 #endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
335
336 #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
337 #define INCLUDED_volk_32fc_index_min_16u_u_H
338
339 #include <inttypes.h>
340 #include <limits.h>
341 #include <stdio.h>
342 #include <volk/volk_common.h>
343 #include <volk/volk_complex.h>
344
345 #ifdef LV_HAVE_AVX2
346 #include <immintrin.h>
347 #include <volk/volk_avx2_intrinsics.h>
348
349 2 static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
350 const lv_32fc_t* source,
351 uint32_t num_points)
352 {
353 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
354
355 2 const __m256i indices_increment = _mm256_set1_epi32(8);
356 /*
357 * At the start of each loop iteration current_indices holds the indices of
358 * the complex numbers loaded from memory. Explanation for odd order is given
359 * in implementation of vector_32fc_index_min_variant0().
360 */
361 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
362
363 2 __m256 min_values = _mm256_set1_ps(FLT_MAX);
364 2 __m256i min_indices = _mm256_setzero_si256();
365
366
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
367 16382 __m256 in0 = _mm256_loadu_ps((float*)source);
368 16382 __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
369 16382 vector_32fc_index_min_variant0(
370 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
371 16382 source += 8;
372 }
373
374 // determine minimum value and index in the result of the vectorized loop
375 __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
376 __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
377 2 _mm256_store_ps(min_values_buffer, min_values);
378 2 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
379
380 2 float min = FLT_MAX;
381 2 uint32_t index = 0;
382
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
383
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (min_values_buffer[i] < min) {
384 4 min = min_values_buffer[i];
385 4 index = min_indices_buffer[i];
386 }
387 }
388
389 // handle tail not processed by the vectorized loop
390
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
391 14 const float abs_squared =
392 14 lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
393
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared < min) {
394 min = abs_squared;
395 index = i;
396 }
397 14 ++source;
398 }
399
400 2 *target = index;
401 2 }
402
403 #endif /*LV_HAVE_AVX2*/
404
405 #ifdef LV_HAVE_AVX2
406 #include <immintrin.h>
407 #include <volk/volk_avx2_intrinsics.h>
408
409 2 static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
410 const lv_32fc_t* source,
411 uint32_t num_points)
412 {
413 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
414
415 2 const __m256i indices_increment = _mm256_set1_epi32(8);
416 /*
417 * At the start of each loop iteration current_indices holds the indices of
418 * the complex numbers loaded from memory. Explanation for odd order is given
419 * in implementation of vector_32fc_index_min_variant0().
420 */
421 2 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
422
423 2 __m256 min_values = _mm256_set1_ps(FLT_MAX);
424 2 __m256i min_indices = _mm256_setzero_si256();
425
426
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (unsigned i = 0; i < num_points / 8u; ++i) {
427 16382 __m256 in0 = _mm256_loadu_ps((float*)source);
428 16382 __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
429 16382 vector_32fc_index_min_variant1(
430 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
431 16382 source += 8;
432 }
433
434 // determine minimum value and index in the result of the vectorized loop
435 __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
436 __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
437 2 _mm256_store_ps(min_values_buffer, min_values);
438 2 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
439
440 2 float min = FLT_MAX;
441 2 uint32_t index = 0;
442
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (unsigned i = 0; i < 8; i++) {
443
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
16 if (min_values_buffer[i] < min) {
444 4 min = min_values_buffer[i];
445 4 index = min_indices_buffer[i];
446 }
447 }
448
449 // handle tail not processed by the vectorized loop
450
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (unsigned i = num_points & (~7u); i < num_points; ++i) {
451 14 const float abs_squared =
452 14 lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
453
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (abs_squared < min) {
454 min = abs_squared;
455 index = i;
456 }
457 14 ++source;
458 }
459
460 2 *target = index;
461 2 }
462
463 #endif /*LV_HAVE_AVX2*/
464
465 #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
466