GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_index_min_16u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 114 130 87.7%
Functions: 5 5 100.0%
Branches: 44 60 73.3%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_index_min_16u
12 *
13 * \b Overview
14 *
15 * Returns Argmin_i x[i]. Finds and returns the index which contains
16 * the first minimum value in the given vector.
17 *
18 * Note that num_points is a uint32_t, but the return value is
19 * uint16_t. Providing a vector larger than the max of a uint16_t
20 * (65535) would miss anything outside of this boundary. The kernel
21 * checks num_points and caps it to this max value (USHRT_MAX)
22 * anyway.
23 *
24 * <b>Dispatcher Prototype</b>
25 * \code
26 * void volk_32f_index_min_16u(uint16_t* target, const float* source, uint32_t num_points)
27 * \endcode
28 *
29 * \b Inputs
30 * \li source: The input vector of floats.
31 * \li num_points: The number of data points.
32 *
33 * \b Outputs
34 * \li target: The index of the first minimum value in the input buffer.
35 *
36 * \b Example
37 * \code
38 * int N = 10;
39 * uint32_t alignment = volk_get_alignment();
40 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
41 * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
42 *
43 * for(uint32_t ii = 0; ii < N; ++ii){
44 * float x = (float)ii;
45 * // a parabola with a minimum at x=4
46 * in[ii] = (x-4) * (x-4) - 5;
47 * }
48 *
49 * volk_32f_index_min_16u(out, in, N);
50 *
51 * printf("minimum is %1.2f at index %u\n", in[*out], *out);
52 *
53 * volk_free(in);
54 * volk_free(out);
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_index_min_16u_a_H
59 #define INCLUDED_volk_32f_index_min_16u_a_H
60
61 #include <inttypes.h>
62 #include <limits.h>
63 #include <stdio.h>
64 #include <volk/volk_common.h>
65
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68
69 static inline void
70 2 volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
71 {
72 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73 2 const uint32_t eighthPoints = num_points / 8;
74
75 2 float* inputPtr = (float*)source;
76
77 2 __m256 indexIncrementValues = _mm256_set1_ps(8);
78 2 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
79
80 2 float min = source[0];
81 2 float index = 0;
82 2 __m256 minValues = _mm256_set1_ps(min);
83 2 __m256 minValuesIndex = _mm256_setzero_ps();
84 __m256 compareResults;
85 __m256 currentValues;
86
87 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
88 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
89
90
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (uint32_t number = 0; number < eighthPoints; number++) {
91
92 16382 currentValues = _mm256_load_ps(inputPtr);
93 16382 inputPtr += 8;
94 16382 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
95
96 16382 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
97
98 16382 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
99 16382 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
100 }
101
102 // Reduce the 8 per-lane minima (and their indexes) to a single smallest value
103 _mm256_store_ps(minValuesBuffer, minValues);
104 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
105
106
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (uint32_t number = 0; number < 8; number++) {
107
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
16 if (minValuesBuffer[number] < min) {
108 6 index = minIndexesBuffer[number];
109 6 min = minValuesBuffer[number];
110
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
10 } else if (minValuesBuffer[number] == min) {
111 if (index > minIndexesBuffer[number])
112 index = minIndexesBuffer[number];
113 }
114 }
115
116
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
117
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (source[number] < min) {
118 index = number;
119 min = source[number];
120 }
121 }
122 2 target[0] = (uint16_t)index;
123 2 }
124
125 #endif /*LV_HAVE_AVX*/
126
127 #ifdef LV_HAVE_SSE4_1
128 #include <smmintrin.h>
129
130 2 static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target,
131 const float* source,
132 uint32_t num_points)
133 {
134 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
135 2 const uint32_t quarterPoints = num_points / 4;
136
137 2 float* inputPtr = (float*)source;
138
139 2 __m128 indexIncrementValues = _mm_set1_ps(4);
140 2 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
141
142 2 float min = source[0];
143 2 float index = 0;
144 2 __m128 minValues = _mm_set1_ps(min);
145 2 __m128 minValuesIndex = _mm_setzero_ps();
146 __m128 compareResults;
147 __m128 currentValues;
148
149 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
150 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
151
152
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (uint32_t number = 0; number < quarterPoints; number++) {
153
154 32766 currentValues = _mm_load_ps(inputPtr);
155 32766 inputPtr += 4;
156 32766 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
157
158 32766 compareResults = _mm_cmplt_ps(currentValues, minValues);
159
160 32766 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
161 32766 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
162 }
163
164 // Calculate the smallest value from the remaining 4 points
165 _mm_store_ps(minValuesBuffer, minValues);
166 _mm_store_ps(minIndexesBuffer, minValuesIndex);
167
168
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (uint32_t number = 0; number < 4; number++) {
169
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
8 if (minValuesBuffer[number] < min) {
170 5 index = minIndexesBuffer[number];
171 5 min = minValuesBuffer[number];
172
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
3 } else if (minValuesBuffer[number] == min) {
173 if (index > minIndexesBuffer[number])
174 index = minIndexesBuffer[number];
175 }
176 }
177
178
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
179
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (source[number] < min) {
180 index = number;
181 min = source[number];
182 }
183 }
184 2 target[0] = (uint16_t)index;
185 2 }
186
187 #endif /*LV_HAVE_SSE4_1*/
188
189
190 #ifdef LV_HAVE_SSE
191
192 #include <xmmintrin.h>
193
194 static inline void
195 2 volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
196 {
197 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
198 2 const uint32_t quarterPoints = num_points / 4;
199
200 2 float* inputPtr = (float*)source;
201
202 2 __m128 indexIncrementValues = _mm_set1_ps(4);
203 2 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
204
205 2 float min = source[0];
206 2 float index = 0;
207 2 __m128 minValues = _mm_set1_ps(min);
208 2 __m128 minValuesIndex = _mm_setzero_ps();
209 __m128 compareResults;
210 __m128 currentValues;
211
212 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
213 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
214
215
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (uint32_t number = 0; number < quarterPoints; number++) {
216
217 32766 currentValues = _mm_load_ps(inputPtr);
218 32766 inputPtr += 4;
219 32766 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
220
221 32766 compareResults = _mm_cmplt_ps(currentValues, minValues);
222
223 98298 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
224 _mm_andnot_ps(compareResults, minValuesIndex));
225 98298 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
226 _mm_andnot_ps(compareResults, minValues));
227 }
228
229 // Calculate the smallest value from the remaining 4 points
230 _mm_store_ps(minValuesBuffer, minValues);
231 _mm_store_ps(minIndexesBuffer, minValuesIndex);
232
233
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (uint32_t number = 0; number < 4; number++) {
234
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
8 if (minValuesBuffer[number] < min) {
235 5 index = minIndexesBuffer[number];
236 5 min = minValuesBuffer[number];
237
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
3 } else if (minValuesBuffer[number] == min) {
238 if (index > minIndexesBuffer[number])
239 index = minIndexesBuffer[number];
240 }
241 }
242
243
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
244
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (source[number] < min) {
245 index = number;
246 min = source[number];
247 }
248 }
249 2 target[0] = (uint16_t)index;
250 2 }
251
252 #endif /*LV_HAVE_SSE*/
253
254
255 #ifdef LV_HAVE_GENERIC
256
257 static inline void
258 2 volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
259 {
260 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
261
262 2 float min = source[0];
263 2 uint16_t index = 0;
264
265
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.
131070 for (uint32_t i = 1; i < num_points; ++i) {
266
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 131044 times.
131068 if (source[i] < min) {
267 24 index = i;
268 24 min = source[i];
269 }
270 }
271 2 target[0] = index;
272 2 }
273
274 #endif /*LV_HAVE_GENERIC*/
275
276
277 #endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
278
279
280 #ifndef INCLUDED_volk_32f_index_min_16u_u_H
281 #define INCLUDED_volk_32f_index_min_16u_u_H
282
283 #include <inttypes.h>
284 #include <limits.h>
285 #include <stdio.h>
286 #include <volk/volk_common.h>
287
288 #ifdef LV_HAVE_AVX
289 #include <immintrin.h>
290
291 static inline void
292 2 volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
293 {
294 2 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
295 2 const uint32_t eighthPoints = num_points / 8;
296
297 2 float* inputPtr = (float*)source;
298
299 2 __m256 indexIncrementValues = _mm256_set1_ps(8);
300 2 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
301
302 2 float min = source[0];
303 2 float index = 0;
304 2 __m256 minValues = _mm256_set1_ps(min);
305 2 __m256 minValuesIndex = _mm256_setzero_ps();
306 __m256 compareResults;
307 __m256 currentValues;
308
309 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
310 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
311
312
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (uint32_t number = 0; number < eighthPoints; number++) {
313
314 16382 currentValues = _mm256_loadu_ps(inputPtr);
315 16382 inputPtr += 8;
316 16382 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
317
318 16382 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
319
320 16382 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
321 16382 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
322 }
323
324 // Reduce the 8 per-lane minima (and their indexes) to a single smallest value
325 _mm256_storeu_ps(minValuesBuffer, minValues);
326 _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
327
328
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (uint32_t number = 0; number < 8; number++) {
329
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
16 if (minValuesBuffer[number] < min) {
330 6 index = minIndexesBuffer[number];
331 6 min = minValuesBuffer[number];
332
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
10 } else if (minValuesBuffer[number] == min) {
333 if (index > minIndexesBuffer[number])
334 index = minIndexesBuffer[number];
335 }
336 }
337
338
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
339
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
14 if (source[number] < min) {
340 index = number;
341 min = source[number];
342 }
343 }
344 2 target[0] = (uint16_t)index;
345 2 }
346
347 #endif /*LV_HAVE_AVX*/
348
349 #endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
350