GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_index_max_16u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 123 139 88.5%
Functions: 5 5 100.0%
Branches: 44 60 73.3%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_index_max_16u
12 *
13 * \b Overview
14 *
15 * Returns Argmax_i x[i]. Finds and returns the index which contains
16 * the first maximum value in the given vector.
17 *
18 * Note that num_points is a uint32_t, but the return value is a
19 * uint16_t. Indexes beyond the maximum of a uint16_t (65535) cannot
20 * be represented, so the kernel caps num_points at USHRT_MAX
21 * (65535); any samples past that boundary are ignored rather than
22 * searched.
23 *
24 * <b>Dispatcher Prototype</b>
25 * \code
26 * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points)
27 * \endcode
28 *
29 * \b Inputs
30 * \li src0: The input vector of floats.
31 * \li num_points: The number of data points.
32 *
33 * \b Outputs
34 * \li target: The index of the first maximum value in the input buffer.
35 *
36 * \b Example
37 * \code
38 * int N = 10;
39 * uint32_t alignment = volk_get_alignment();
40 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
41 * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
42 *
43 * for(uint32_t ii = 0; ii < N; ++ii){
44 * float x = (float)ii;
45 * // a parabola with a maximum at x=4
46 * in[ii] = -(x-4) * (x-4) + 5;
47 * }
48 *
49 * volk_32f_index_max_16u(out, in, N);
50 *
51 * printf("maximum is %1.2f at index %u\n", in[*out], *out);
52 *
53 * volk_free(in);
54 * volk_free(out);
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
59 #define INCLUDED_volk_32f_index_max_16u_a_H
60
61 #include <inttypes.h>
62 #include <limits.h>
63 #include <stdio.h>
64 #include <volk/volk_common.h>
65
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68
69 static inline void
70 2 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
71 {
72 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73
74 2 uint32_t number = 0;
75 2 const uint32_t eighthPoints = num_points / 8;
76
77 2 float* inputPtr = (float*)src0;
78
79 2 __m256 indexIncrementValues = _mm256_set1_ps(8);
80 2 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
81
82 2 float max = src0[0];
83 2 float index = 0;
84 2 __m256 maxValues = _mm256_set1_ps(max);
85 2 __m256 maxValuesIndex = _mm256_setzero_ps();
86 __m256 compareResults;
87 __m256 currentValues;
88
89 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
90 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
91
92
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < eighthPoints; number++) {
93
94 16382 currentValues = _mm256_load_ps(inputPtr);
95 16382 inputPtr += 8;
96 16382 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
97
98 16382 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
99
100 16382 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101 16382 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
102 }
103
104 // Reduce the 8 per-lane maxima (and their indexes) to a single maximum
105 _mm256_store_ps(maxValuesBuffer, maxValues);
106 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
107
108
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (number = 0; number < 8; number++) {
109
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
16 if (maxValuesBuffer[number] > max) {
110 6 index = maxIndexesBuffer[number];
111 6 max = maxValuesBuffer[number];
112
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
10 } else if (maxValuesBuffer[number] == max) {
113 if (index > maxIndexesBuffer[number])
114 index = maxIndexesBuffer[number];
115 }
116 }
117
118 2 number = eighthPoints * 8;
119
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
120
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (src0[number] > max) {
121 index = number;
122 max = src0[number];
123 }
124 }
125 2 target[0] = (uint16_t)index;
126 2 }
127
128 #endif /*LV_HAVE_AVX*/
129
130 #ifdef LV_HAVE_SSE4_1
131 #include <smmintrin.h>
132
133 static inline void
134 2 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
135 {
136 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
137
138 2 uint32_t number = 0;
139 2 const uint32_t quarterPoints = num_points / 4;
140
141 2 float* inputPtr = (float*)src0;
142
143 2 __m128 indexIncrementValues = _mm_set1_ps(4);
144 2 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
145
146 2 float max = src0[0];
147 2 float index = 0;
148 2 __m128 maxValues = _mm_set1_ps(max);
149 2 __m128 maxValuesIndex = _mm_setzero_ps();
150 __m128 compareResults;
151 __m128 currentValues;
152
153 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
154 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
155
156
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < quarterPoints; number++) {
157
158 32766 currentValues = _mm_load_ps(inputPtr);
159 32766 inputPtr += 4;
160 32766 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
161
162 32766 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
163
164 32766 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165 32766 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
166 }
167
168 // Reduce the 4 per-lane maxima (and their indexes) to a single maximum
169 _mm_store_ps(maxValuesBuffer, maxValues);
170 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
171
172
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (number = 0; number < 4; number++) {
173
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (maxValuesBuffer[number] > max) {
174 4 index = maxIndexesBuffer[number];
175 4 max = maxValuesBuffer[number];
176
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
4 } else if (maxValuesBuffer[number] == max) {
177 if (index > maxIndexesBuffer[number])
178 index = maxIndexesBuffer[number];
179 }
180 }
181
182 2 number = quarterPoints * 4;
183
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
184
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (src0[number] > max) {
185 index = number;
186 max = src0[number];
187 }
188 }
189 2 target[0] = (uint16_t)index;
190 2 }
191
192 #endif /*LV_HAVE_SSE4_1*/
193
194
195 #ifdef LV_HAVE_SSE
196
197 #include <xmmintrin.h>
198
199 static inline void
200 2 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
201 {
202 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
203
204 2 uint32_t number = 0;
205 2 const uint32_t quarterPoints = num_points / 4;
206
207 2 float* inputPtr = (float*)src0;
208
209 2 __m128 indexIncrementValues = _mm_set1_ps(4);
210 2 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
211
212 2 float max = src0[0];
213 2 float index = 0;
214 2 __m128 maxValues = _mm_set1_ps(max);
215 2 __m128 maxValuesIndex = _mm_setzero_ps();
216 __m128 compareResults;
217 __m128 currentValues;
218
219 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
220 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
221
222
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < quarterPoints; number++) {
223
224 32766 currentValues = _mm_load_ps(inputPtr);
225 32766 inputPtr += 4;
226 32766 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
227
228 32766 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
229
230 98298 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231 _mm_andnot_ps(compareResults, maxValuesIndex));
232 98298 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233 _mm_andnot_ps(compareResults, maxValues));
234 }
235
236 // Reduce the 4 per-lane maxima (and their indexes) to a single maximum
237 _mm_store_ps(maxValuesBuffer, maxValues);
238 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
239
240
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (number = 0; number < 4; number++) {
241
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (maxValuesBuffer[number] > max) {
242 4 index = maxIndexesBuffer[number];
243 4 max = maxValuesBuffer[number];
244
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
4 } else if (maxValuesBuffer[number] == max) {
245 if (index > maxIndexesBuffer[number])
246 index = maxIndexesBuffer[number];
247 }
248 }
249
250 2 number = quarterPoints * 4;
251
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
252
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (src0[number] > max) {
253 index = number;
254 max = src0[number];
255 }
256 }
257 2 target[0] = (uint16_t)index;
258 2 }
259
260 #endif /*LV_HAVE_SSE*/
261
262
263 #ifdef LV_HAVE_GENERIC
264
265 static inline void
266 2 volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
267 {
268 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
269
270 2 float max = src0[0];
271 2 uint16_t index = 0;
272
273 2 uint32_t i = 1;
274
275
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.
131070 for (; i < num_points; ++i) {
276
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 131051 times.
131068 if (src0[i] > max) {
277 17 index = i;
278 17 max = src0[i];
279 }
280 }
281 2 target[0] = index;
282 2 }
283
284 #endif /*LV_HAVE_GENERIC*/
285
286
287 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
288
289
290 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
291 #define INCLUDED_volk_32f_index_max_16u_u_H
292
293 #include <inttypes.h>
294 #include <limits.h>
295 #include <stdio.h>
296 #include <volk/volk_common.h>
297
298 #ifdef LV_HAVE_AVX
299 #include <immintrin.h>
300
301 static inline void
302 2 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
303 {
304 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
305
306 2 uint32_t number = 0;
307 2 const uint32_t eighthPoints = num_points / 8;
308
309 2 float* inputPtr = (float*)src0;
310
311 2 __m256 indexIncrementValues = _mm256_set1_ps(8);
312 2 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
313
314 2 float max = src0[0];
315 2 float index = 0;
316 2 __m256 maxValues = _mm256_set1_ps(max);
317 2 __m256 maxValuesIndex = _mm256_setzero_ps();
318 __m256 compareResults;
319 __m256 currentValues;
320
321 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
322 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
323
324
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < eighthPoints; number++) {
325
326 16382 currentValues = _mm256_loadu_ps(inputPtr);
327 16382 inputPtr += 8;
328 16382 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
329
330 16382 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
331
332 16382 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
333 16382 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
334 }
335
336 // Reduce the 8 per-lane maxima (and their indexes) to a single maximum
337 _mm256_storeu_ps(maxValuesBuffer, maxValues);
338 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
339
340
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (number = 0; number < 8; number++) {
341
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
16 if (maxValuesBuffer[number] > max) {
342 6 index = maxIndexesBuffer[number];
343 6 max = maxValuesBuffer[number];
344
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
10 } else if (maxValuesBuffer[number] == max) {
345 if (index > maxIndexesBuffer[number])
346 index = maxIndexesBuffer[number];
347 }
348 }
349
350 2 number = eighthPoints * 8;
351
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
352
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (src0[number] > max) {
353 index = number;
354 max = src0[number];
355 }
356 }
357 2 target[0] = (uint16_t)index;
358 2 }
359
360 #endif /*LV_HAVE_AVX*/
361
362 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
363