Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_index_max_16u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Returns Argmax_i x[i]. Finds and returns the index which contains | ||
16 | * the first maximum value in the given vector. | ||
17 | * | ||
18 | * Note that num_points is a uint32_t, but the return value is | ||
19 | * uint16_t. Providing a vector larger than the max of a uint16_t | ||
20 | * (USHRT_MAX, 65535) would miss anything outside of this boundary. The | ||
21 | * kernel caps num_points to this max value before searching, so only the | ||
22 | * first 65535 elements are ever examined. | ||
23 | * | ||
24 | * <b>Dispatcher Prototype</b> | ||
25 | * \code | ||
26 | * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points) | ||
27 | * \endcode | ||
28 | * | ||
29 | * \b Inputs | ||
30 | * \li src0: The input vector of floats. | ||
31 | * \li num_points: The number of data points. | ||
32 | * | ||
33 | * \b Outputs | ||
34 | * \li target: The index of the first maximum value in the input buffer. | ||
35 | * | ||
36 | * \b Example | ||
37 | * \code | ||
38 | * int N = 10; | ||
39 | * uint32_t alignment = volk_get_alignment(); | ||
40 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
41 | * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment); | ||
42 | * | ||
43 | * for(uint32_t ii = 0; ii < N; ++ii){ | ||
44 | * float x = (float)ii; | ||
45 | * // a parabola with a maximum at x=4 | ||
46 | * in[ii] = -(x-4) * (x-4) + 5; | ||
47 | * } | ||
48 | * | ||
49 | * volk_32f_index_max_16u(out, in, N); | ||
50 | * | ||
51 | * printf("maximum is %1.2f at index %u\n", in[*out], *out); | ||
52 | * | ||
53 | * volk_free(in); | ||
54 | * volk_free(out); | ||
55 | * \endcode | ||
56 | */ | ||
57 | |||
58 | #ifndef INCLUDED_volk_32f_index_max_16u_a_H | ||
59 | #define INCLUDED_volk_32f_index_max_16u_a_H | ||
60 | |||
61 | #include <inttypes.h> | ||
62 | #include <limits.h> | ||
63 | #include <stdio.h> | ||
64 | #include <volk/volk_common.h> | ||
65 | |||
66 | #ifdef LV_HAVE_AVX | ||
67 | #include <immintrin.h> | ||
68 | |||
69 | static inline void | ||
70 | 2 | volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) | |
71 | { | ||
72 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
73 | |||
74 | 2 | uint32_t number = 0; | |
75 | 2 | const uint32_t eighthPoints = num_points / 8; | |
76 | |||
77 | 2 | float* inputPtr = (float*)src0; | |
78 | |||
79 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
80 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
81 | |||
82 | 2 | float max = src0[0]; | |
83 | 2 | float index = 0; | |
84 | 2 | __m256 maxValues = _mm256_set1_ps(max); | |
85 | 2 | __m256 maxValuesIndex = _mm256_setzero_ps(); | |
86 | __m256 compareResults; | ||
87 | __m256 currentValues; | ||
88 | |||
89 | __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; | ||
90 | __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; | ||
91 | |||
92 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < eighthPoints; number++) { |
93 | |||
94 | 16382 | currentValues = _mm256_load_ps(inputPtr); | |
95 | 16382 | inputPtr += 8; | |
96 | 16382 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
97 | |||
98 | 16382 | compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); | |
99 | |||
100 | 16382 | maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
101 | 16382 | maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); | |
102 | } | ||
103 | |||
104 | // Calculate the largest value from the remaining 4 points | ||
105 | _mm256_store_ps(maxValuesBuffer, maxValues); | ||
106 | _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
107 | |||
108 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; number++) { |
109 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
|
16 | if (maxValuesBuffer[number] > max) { |
110 | 6 | index = maxIndexesBuffer[number]; | |
111 | 6 | max = maxValuesBuffer[number]; | |
112 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
|
10 | } else if (maxValuesBuffer[number] == max) { |
113 | ✗ | if (index > maxIndexesBuffer[number]) | |
114 | ✗ | index = maxIndexesBuffer[number]; | |
115 | } | ||
116 | } | ||
117 | |||
118 | 2 | number = eighthPoints * 8; | |
119 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
120 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (src0[number] > max) { |
121 | ✗ | index = number; | |
122 | ✗ | max = src0[number]; | |
123 | } | ||
124 | } | ||
125 | 2 | target[0] = (uint16_t)index; | |
126 | 2 | } | |
127 | |||
128 | #endif /*LV_HAVE_AVX*/ | ||
129 | |||
130 | #ifdef LV_HAVE_SSE4_1 | ||
131 | #include <smmintrin.h> | ||
132 | |||
133 | static inline void | ||
134 | 2 | volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) | |
135 | { | ||
136 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
137 | |||
138 | 2 | uint32_t number = 0; | |
139 | 2 | const uint32_t quarterPoints = num_points / 4; | |
140 | |||
141 | 2 | float* inputPtr = (float*)src0; | |
142 | |||
143 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
144 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
145 | |||
146 | 2 | float max = src0[0]; | |
147 | 2 | float index = 0; | |
148 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
149 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
150 | __m128 compareResults; | ||
151 | __m128 currentValues; | ||
152 | |||
153 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
154 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
155 | |||
156 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < quarterPoints; number++) { |
157 | |||
158 | 32766 | currentValues = _mm_load_ps(inputPtr); | |
159 | 32766 | inputPtr += 4; | |
160 | 32766 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
161 | |||
162 | 32766 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
163 | |||
164 | 32766 | maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
165 | 32766 | maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); | |
166 | } | ||
167 | |||
168 | // Calculate the largest value from the remaining 4 points | ||
169 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
170 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
171 | |||
172 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
173 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (maxValuesBuffer[number] > max) { |
174 | 4 | index = maxIndexesBuffer[number]; | |
175 | 4 | max = maxValuesBuffer[number]; | |
176 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | } else if (maxValuesBuffer[number] == max) { |
177 | ✗ | if (index > maxIndexesBuffer[number]) | |
178 | ✗ | index = maxIndexesBuffer[number]; | |
179 | } | ||
180 | } | ||
181 | |||
182 | 2 | number = quarterPoints * 4; | |
183 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
184 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
185 | ✗ | index = number; | |
186 | ✗ | max = src0[number]; | |
187 | } | ||
188 | } | ||
189 | 2 | target[0] = (uint16_t)index; | |
190 | 2 | } | |
191 | |||
192 | #endif /*LV_HAVE_SSE4_1*/ | ||
193 | |||
194 | |||
195 | #ifdef LV_HAVE_SSE | ||
196 | |||
197 | #include <xmmintrin.h> | ||
198 | |||
199 | static inline void | ||
200 | 2 | volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) | |
201 | { | ||
202 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
203 | |||
204 | 2 | uint32_t number = 0; | |
205 | 2 | const uint32_t quarterPoints = num_points / 4; | |
206 | |||
207 | 2 | float* inputPtr = (float*)src0; | |
208 | |||
209 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
210 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
211 | |||
212 | 2 | float max = src0[0]; | |
213 | 2 | float index = 0; | |
214 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
215 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
216 | __m128 compareResults; | ||
217 | __m128 currentValues; | ||
218 | |||
219 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
220 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
221 | |||
222 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < quarterPoints; number++) { |
223 | |||
224 | 32766 | currentValues = _mm_load_ps(inputPtr); | |
225 | 32766 | inputPtr += 4; | |
226 | 32766 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
227 | |||
228 | 32766 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
229 | |||
230 | 98298 | maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), | |
231 | _mm_andnot_ps(compareResults, maxValuesIndex)); | ||
232 | 98298 | maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), | |
233 | _mm_andnot_ps(compareResults, maxValues)); | ||
234 | } | ||
235 | |||
236 | // Calculate the largest value from the remaining 4 points | ||
237 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
238 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
239 | |||
240 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
241 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (maxValuesBuffer[number] > max) { |
242 | 4 | index = maxIndexesBuffer[number]; | |
243 | 4 | max = maxValuesBuffer[number]; | |
244 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | } else if (maxValuesBuffer[number] == max) { |
245 | ✗ | if (index > maxIndexesBuffer[number]) | |
246 | ✗ | index = maxIndexesBuffer[number]; | |
247 | } | ||
248 | } | ||
249 | |||
250 | 2 | number = quarterPoints * 4; | |
251 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
252 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
253 | ✗ | index = number; | |
254 | ✗ | max = src0[number]; | |
255 | } | ||
256 | } | ||
257 | 2 | target[0] = (uint16_t)index; | |
258 | 2 | } | |
259 | |||
260 | #endif /*LV_HAVE_SSE*/ | ||
261 | |||
262 | |||
263 | #ifdef LV_HAVE_GENERIC | ||
264 | |||
/*!
 * \brief Portable kernel: writes the index of the first maximum of src0.
 *
 * num_points is capped to USHRT_MAX before searching, so elements beyond that
 * count are never examined. Ties resolve to the lowest index because a
 * candidate only replaces the current maximum when it is strictly greater.
 *
 * \param target Output; target[0] receives the index (0 when num_points is 0).
 * \param src0 Input vector of floats.
 * \param num_points Number of elements available in src0.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    // Nothing to seed the search with: report index 0 without touching src0.
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (src0[i] > max) {
            // i is below the USHRT_MAX cap applied above.
            index = (uint16_t)i;
            max = src0[i];
        }
    }
    target[0] = index;
}
283 | |||
284 | #endif /*LV_HAVE_GENERIC*/ | ||
285 | |||
286 | |||
287 | #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ | ||
288 | |||
289 | |||
290 | #ifndef INCLUDED_volk_32f_index_max_16u_u_H | ||
291 | #define INCLUDED_volk_32f_index_max_16u_u_H | ||
292 | |||
293 | #include <inttypes.h> | ||
294 | #include <limits.h> | ||
295 | #include <stdio.h> | ||
296 | #include <volk/volk_common.h> | ||
297 | |||
298 | #ifdef LV_HAVE_AVX | ||
299 | #include <immintrin.h> | ||
300 | |||
301 | static inline void | ||
302 | 2 | volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) | |
303 | { | ||
304 | 2 | num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; | |
305 | |||
306 | 2 | uint32_t number = 0; | |
307 | 2 | const uint32_t eighthPoints = num_points / 8; | |
308 | |||
309 | 2 | float* inputPtr = (float*)src0; | |
310 | |||
311 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
312 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
313 | |||
314 | 2 | float max = src0[0]; | |
315 | 2 | float index = 0; | |
316 | 2 | __m256 maxValues = _mm256_set1_ps(max); | |
317 | 2 | __m256 maxValuesIndex = _mm256_setzero_ps(); | |
318 | __m256 compareResults; | ||
319 | __m256 currentValues; | ||
320 | |||
321 | __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; | ||
322 | __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; | ||
323 | |||
324 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < eighthPoints; number++) { |
325 | |||
326 | 16382 | currentValues = _mm256_loadu_ps(inputPtr); | |
327 | 16382 | inputPtr += 8; | |
328 | 16382 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
329 | |||
330 | 16382 | compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); | |
331 | |||
332 | 16382 | maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
333 | 16382 | maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); | |
334 | } | ||
335 | |||
336 | // Calculate the largest value from the remaining 4 points | ||
337 | _mm256_storeu_ps(maxValuesBuffer, maxValues); | ||
338 | _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); | ||
339 | |||
340 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; number++) { |
341 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 10 times.
|
16 | if (maxValuesBuffer[number] > max) { |
342 | 6 | index = maxIndexesBuffer[number]; | |
343 | 6 | max = maxValuesBuffer[number]; | |
344 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
|
10 | } else if (maxValuesBuffer[number] == max) { |
345 | ✗ | if (index > maxIndexesBuffer[number]) | |
346 | ✗ | index = maxIndexesBuffer[number]; | |
347 | } | ||
348 | } | ||
349 | |||
350 | 2 | number = eighthPoints * 8; | |
351 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
352 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (src0[number] > max) { |
353 | ✗ | index = number; | |
354 | ✗ | max = src0[number]; | |
355 | } | ||
356 | } | ||
357 | 2 | target[0] = (uint16_t)index; | |
358 | 2 | } | |
359 | |||
360 | #endif /*LV_HAVE_AVX*/ | ||
361 | |||
362 | #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/ | ||
363 |