Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2016 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_index_max_32u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum | ||
16 | * value in the given vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_index_max_32u(uint32_t* target, const float* src0, uint32_t num_points) | ||
21 | * \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li src0: The input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li target: The index of the first maximum value in the input buffer. Left unmodified when num_points is 0. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | * int N = 10; | ||
33 | * uint32_t alignment = volk_get_alignment(); | ||
34 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment); | ||
36 | * | ||
37 | * for(uint32_t ii = 0; ii < N; ++ii){ | ||
38 | * float x = (float)ii; | ||
39 | * // a parabola with a maximum at x=4 | ||
40 | * in[ii] = -(x-4) * (x-4) + 5; | ||
41 | * } | ||
42 | * | ||
43 | * volk_32f_index_max_32u(out, in, N); | ||
44 | * | ||
45 | * printf("maximum is %1.2f at index %u\n", in[*out], *out); | ||
46 | * | ||
47 | * volk_free(in); | ||
48 | * volk_free(out); | ||
49 | * \endcode | ||
50 | */ | ||
51 | |||
52 | #ifndef INCLUDED_volk_32f_index_max_32u_a_H | ||
53 | #define INCLUDED_volk_32f_index_max_32u_a_H | ||
54 | |||
55 | #include <inttypes.h> | ||
56 | #include <stdio.h> | ||
57 | #include <volk/volk_common.h> | ||
58 | |||
59 | #ifdef LV_HAVE_SSE4_1 | ||
60 | #include <smmintrin.h> | ||
61 | |||
62 | static inline void | ||
63 | 2 | volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) | |
64 | { | ||
65 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
66 | 2 | uint32_t number = 0; | |
67 | 2 | const uint32_t quarterPoints = num_points / 4; | |
68 | |||
69 | 2 | float* inputPtr = (float*)src0; | |
70 | |||
71 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
72 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
73 | |||
74 | 2 | float max = src0[0]; | |
75 | 2 | float index = 0; | |
76 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
77 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
78 | __m128 compareResults; | ||
79 | __m128 currentValues; | ||
80 | |||
81 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
82 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
83 | |||
84 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
85 | |||
86 | 65534 | currentValues = _mm_load_ps(inputPtr); | |
87 | 65534 | inputPtr += 4; | |
88 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
89 | |||
90 | 65534 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
91 | |||
92 | maxValuesIndex = | ||
93 | 65534 | _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
94 | 65534 | maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); | |
95 | } | ||
96 | |||
97 | // Reduce the 4 per-lane maxima to a single maximum; on ties keep the lowest index | ||
98 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
99 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
100 | |||
101 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
102 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (maxValuesBuffer[number] > max) { |
103 | 3 | index = maxIndexesBuffer[number]; | |
104 | 3 | max = maxValuesBuffer[number]; | |
105 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (maxValuesBuffer[number] == max) { |
106 | ✗ | if (index > maxIndexesBuffer[number]) | |
107 | ✗ | index = maxIndexesBuffer[number]; | |
108 | } | ||
109 | } | ||
110 | |||
111 | 2 | number = quarterPoints * 4; | |
112 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
113 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
114 | ✗ | index = number; | |
115 | ✗ | max = src0[number]; | |
116 | } | ||
117 | } | ||
118 | 2 | target[0] = (uint32_t)index; | |
119 | } | ||
120 | 2 | } | |
121 | |||
122 | #endif /*LV_HAVE_SSE4_1*/ | ||
123 | |||
124 | |||
125 | #ifdef LV_HAVE_SSE | ||
126 | |||
127 | #include <xmmintrin.h> | ||
128 | |||
129 | static inline void | ||
130 | 2 | volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) | |
131 | { | ||
132 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
133 | 2 | uint32_t number = 0; | |
134 | 2 | const uint32_t quarterPoints = num_points / 4; | |
135 | |||
136 | 2 | float* inputPtr = (float*)src0; | |
137 | |||
138 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
139 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
140 | |||
141 | 2 | float max = src0[0]; | |
142 | 2 | float index = 0; | |
143 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
144 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
145 | __m128 compareResults; | ||
146 | __m128 currentValues; | ||
147 | |||
148 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
149 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
150 | |||
151 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
152 | |||
153 | 65534 | currentValues = _mm_load_ps(inputPtr); | |
154 | 65534 | inputPtr += 4; | |
155 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
156 | |||
157 | 65534 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
158 | |||
159 | 196602 | maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), | |
160 | _mm_andnot_ps(compareResults, maxValuesIndex)); | ||
161 | |||
162 | 196602 | maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), | |
163 | _mm_andnot_ps(compareResults, maxValues)); | ||
164 | } | ||
165 | |||
166 | // Reduce the 4 per-lane maxima to a single maximum; on ties keep the lowest index | ||
167 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
168 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
169 | |||
170 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
171 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (maxValuesBuffer[number] > max) { |
172 | 3 | index = maxIndexesBuffer[number]; | |
173 | 3 | max = maxValuesBuffer[number]; | |
174 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (maxValuesBuffer[number] == max) { |
175 | ✗ | if (index > maxIndexesBuffer[number]) | |
176 | ✗ | index = maxIndexesBuffer[number]; | |
177 | } | ||
178 | } | ||
179 | |||
180 | 2 | number = quarterPoints * 4; | |
181 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
182 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
183 | ✗ | index = number; | |
184 | ✗ | max = src0[number]; | |
185 | } | ||
186 | } | ||
187 | 2 | target[0] = (uint32_t)index; | |
188 | } | ||
189 | 2 | } | |
190 | |||
191 | #endif /*LV_HAVE_SSE*/ | ||
192 | |||
193 | |||
194 | #ifdef LV_HAVE_AVX | ||
195 | #include <immintrin.h> | ||
196 | |||
197 | static inline void | ||
198 | 2 | volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) | |
199 | { | ||
200 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
201 | 2 | uint32_t number = 0; | |
202 | 2 | const uint32_t quarterPoints = num_points / 8; | |
203 | |||
204 | 2 | float* inputPtr = (float*)src0; | |
205 | |||
206 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
207 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
208 | |||
209 | 2 | float max = src0[0]; | |
210 | 2 | float index = 0; | |
211 | 2 | __m256 maxValues = _mm256_set1_ps(max); | |
212 | 2 | __m256 maxValuesIndex = _mm256_setzero_ps(); | |
213 | __m256 compareResults; | ||
214 | __m256 currentValues; | ||
215 | |||
216 | __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; | ||
217 | __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; | ||
218 | |||
219 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < quarterPoints; number++) { |
220 | 32766 | currentValues = _mm256_load_ps(inputPtr); | |
221 | 32766 | inputPtr += 8; | |
222 | 32766 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
223 | 32766 | compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); | |
224 | maxValuesIndex = | ||
225 | 32766 | _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
226 | 32766 | maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); | |
227 | } | ||
228 | |||
229 | // Reduce the 8 per-lane maxima to a single maximum; on ties keep the lowest index | ||
230 | _mm256_store_ps(maxValuesBuffer, maxValues); | ||
231 | _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
232 | |||
233 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; number++) { |
234 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (maxValuesBuffer[number] > max) { |
235 | 4 | index = maxIndexesBuffer[number]; | |
236 | 4 | max = maxValuesBuffer[number]; | |
237 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | } else if (maxValuesBuffer[number] == max) { |
238 | ✗ | if (index > maxIndexesBuffer[number]) | |
239 | ✗ | index = maxIndexesBuffer[number]; | |
240 | } | ||
241 | } | ||
242 | |||
243 | 2 | number = quarterPoints * 8; | |
244 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
245 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (src0[number] > max) { |
246 | ✗ | index = number; | |
247 | ✗ | max = src0[number]; | |
248 | } | ||
249 | } | ||
250 | 2 | target[0] = (uint32_t)index; | |
251 | } | ||
252 | 2 | } | |
253 | |||
254 | #endif /*LV_HAVE_AVX*/ | ||
255 | |||
256 | |||
257 | #ifdef LV_HAVE_NEON | ||
258 | #include <arm_neon.h> | ||
259 | |||
260 | static inline void | ||
261 | volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) | ||
262 | { | ||
263 | if (num_points > 0) { | ||
264 | uint32_t number = 0; | ||
265 | const uint32_t quarterPoints = num_points / 4; | ||
266 | |||
267 | float* inputPtr = (float*)src0; | ||
268 | float32x4_t indexIncrementValues = vdupq_n_f32(4); | ||
269 | __VOLK_ATTR_ALIGNED(16) | ||
270 | float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; | ||
271 | float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); | ||
272 | |||
273 | float max = src0[0]; | ||
274 | float index = 0; | ||
275 | float32x4_t maxValues = vdupq_n_f32(max); | ||
276 | uint32x4_t maxValuesIndex = vmovq_n_u32(0); | ||
277 | uint32x4_t compareResults; | ||
278 | uint32x4_t currentIndexes_u; | ||
279 | float32x4_t currentValues; | ||
280 | |||
281 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
282 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
283 | |||
284 | for (; number < quarterPoints; number++) { | ||
285 | currentValues = vld1q_f32(inputPtr); | ||
286 | inputPtr += 4; | ||
287 | currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); | ||
288 | currentIndexes_u = vcvtq_u32_f32(currentIndexes); | ||
289 | compareResults = vcleq_f32(currentValues, maxValues); | ||
290 | maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), | ||
291 | vbicq_u32(currentIndexes_u, compareResults)); | ||
292 | maxValues = vmaxq_f32(currentValues, maxValues); | ||
293 | } | ||
294 | |||
295 | // Reduce the 4 per-lane maxima to a single maximum; on ties keep the lowest index | ||
296 | vst1q_f32(maxValuesBuffer, maxValues); | ||
297 | vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); | ||
298 | for (number = 0; number < 4; number++) { | ||
299 | if (maxValuesBuffer[number] > max) { | ||
300 | index = maxIndexesBuffer[number]; | ||
301 | max = maxValuesBuffer[number]; | ||
302 | } else if (maxValues[number] == max) { | ||
303 | if (index > maxIndexesBuffer[number]) | ||
304 | index = maxIndexesBuffer[number]; | ||
305 | } | ||
306 | } | ||
307 | |||
308 | number = quarterPoints * 4; | ||
309 | for (; number < num_points; number++) { | ||
310 | if (src0[number] > max) { | ||
311 | index = number; | ||
312 | max = src0[number]; | ||
313 | } | ||
314 | } | ||
315 | target[0] = (uint32_t)index; | ||
316 | } | ||
317 | } | ||
318 | |||
319 | #endif /*LV_HAVE_NEON*/ | ||
320 | |||
321 | |||
322 | #ifdef LV_HAVE_GENERIC | ||
323 | |||
324 | static inline void | ||
325 | 2 | volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) | |
326 | { | ||
327 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
328 | 2 | float max = src0[0]; | |
329 | 2 | uint32_t index = 0; | |
330 | |||
331 | 2 | uint32_t i = 1; | |
332 | |||
333 |
2/2✓ Branch 0 taken 262140 times.
✓ Branch 1 taken 2 times.
|
262142 | for (; i < num_points; ++i) { |
334 |
2/2✓ Branch 0 taken 36 times.
✓ Branch 1 taken 262104 times.
|
262140 | if (src0[i] > max) { |
335 | 36 | index = i; | |
336 | 36 | max = src0[i]; | |
337 | } | ||
338 | } | ||
339 | 2 | target[0] = index; | |
340 | } | ||
341 | 2 | } | |
342 | |||
343 | #endif /*LV_HAVE_GENERIC*/ | ||
344 | |||
345 | |||
346 | #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/ | ||
347 | |||
348 | |||
349 | #ifndef INCLUDED_volk_32f_index_max_32u_u_H | ||
350 | #define INCLUDED_volk_32f_index_max_32u_u_H | ||
351 | |||
352 | #include <inttypes.h> | ||
353 | #include <stdio.h> | ||
354 | #include <volk/volk_common.h> | ||
355 | |||
356 | |||
357 | #ifdef LV_HAVE_AVX | ||
358 | #include <immintrin.h> | ||
359 | |||
360 | static inline void | ||
361 | 2 | volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) | |
362 | { | ||
363 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
364 | 2 | uint32_t number = 0; | |
365 | 2 | const uint32_t quarterPoints = num_points / 8; | |
366 | |||
367 | 2 | float* inputPtr = (float*)src0; | |
368 | |||
369 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
370 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
371 | |||
372 | 2 | float max = src0[0]; | |
373 | 2 | float index = 0; | |
374 | 2 | __m256 maxValues = _mm256_set1_ps(max); | |
375 | 2 | __m256 maxValuesIndex = _mm256_setzero_ps(); | |
376 | __m256 compareResults; | ||
377 | __m256 currentValues; | ||
378 | |||
379 | __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; | ||
380 | __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; | ||
381 | |||
382 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < quarterPoints; number++) { |
383 | 32766 | currentValues = _mm256_loadu_ps(inputPtr); | |
384 | 32766 | inputPtr += 8; | |
385 | 32766 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
386 | 32766 | compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); | |
387 | maxValuesIndex = | ||
388 | 32766 | _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
389 | 32766 | maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); | |
390 | } | ||
391 | |||
392 | // Reduce the 8 per-lane maxima to a single maximum; on ties keep the lowest index | ||
393 | _mm256_store_ps(maxValuesBuffer, maxValues); | ||
394 | _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
395 | |||
396 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; number++) { |
397 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.
|
16 | if (maxValuesBuffer[number] > max) { |
398 | 4 | index = maxIndexesBuffer[number]; | |
399 | 4 | max = maxValuesBuffer[number]; | |
400 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
|
12 | } else if (maxValuesBuffer[number] == max) { |
401 | ✗ | if (index > maxIndexesBuffer[number]) | |
402 | ✗ | index = maxIndexesBuffer[number]; | |
403 | } | ||
404 | } | ||
405 | |||
406 | 2 | number = quarterPoints * 8; | |
407 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
408 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (src0[number] > max) { |
409 | ✗ | index = number; | |
410 | ✗ | max = src0[number]; | |
411 | } | ||
412 | } | ||
413 | 2 | target[0] = (uint32_t)index; | |
414 | } | ||
415 | 2 | } | |
416 | |||
417 | #endif /*LV_HAVE_AVX*/ | ||
418 | |||
419 | |||
420 | #ifdef LV_HAVE_SSE4_1 | ||
421 | #include <smmintrin.h> | ||
422 | |||
423 | static inline void | ||
424 | 2 | volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) | |
425 | { | ||
426 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
427 | 2 | uint32_t number = 0; | |
428 | 2 | const uint32_t quarterPoints = num_points / 4; | |
429 | |||
430 | 2 | float* inputPtr = (float*)src0; | |
431 | |||
432 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
433 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
434 | |||
435 | 2 | float max = src0[0]; | |
436 | 2 | float index = 0; | |
437 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
438 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
439 | __m128 compareResults; | ||
440 | __m128 currentValues; | ||
441 | |||
442 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
443 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
444 | |||
445 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
446 | 65534 | currentValues = _mm_loadu_ps(inputPtr); | |
447 | 65534 | inputPtr += 4; | |
448 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
449 | 65534 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
450 | maxValuesIndex = | ||
451 | 65534 | _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); | |
452 | 65534 | maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); | |
453 | } | ||
454 | |||
455 | // Reduce the 4 per-lane maxima to a single maximum; on ties keep the lowest index | ||
456 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
457 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
458 | |||
459 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
460 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (maxValuesBuffer[number] > max) { |
461 | 3 | index = maxIndexesBuffer[number]; | |
462 | 3 | max = maxValuesBuffer[number]; | |
463 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (maxValuesBuffer[number] == max) { |
464 | ✗ | if (index > maxIndexesBuffer[number]) | |
465 | ✗ | index = maxIndexesBuffer[number]; | |
466 | } | ||
467 | } | ||
468 | |||
469 | 2 | number = quarterPoints * 4; | |
470 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
471 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
472 | ✗ | index = number; | |
473 | ✗ | max = src0[number]; | |
474 | } | ||
475 | } | ||
476 | 2 | target[0] = (uint32_t)index; | |
477 | } | ||
478 | 2 | } | |
479 | |||
480 | #endif /*LV_HAVE_SSE4_1*/ | ||
481 | |||
482 | #ifdef LV_HAVE_SSE | ||
483 | #include <xmmintrin.h> | ||
484 | |||
485 | static inline void | ||
486 | 2 | volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) | |
487 | { | ||
488 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
489 | 2 | uint32_t number = 0; | |
490 | 2 | const uint32_t quarterPoints = num_points / 4; | |
491 | |||
492 | 2 | float* inputPtr = (float*)src0; | |
493 | |||
494 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
495 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
496 | |||
497 | 2 | float max = src0[0]; | |
498 | 2 | float index = 0; | |
499 | 2 | __m128 maxValues = _mm_set1_ps(max); | |
500 | 2 | __m128 maxValuesIndex = _mm_setzero_ps(); | |
501 | __m128 compareResults; | ||
502 | __m128 currentValues; | ||
503 | |||
504 | __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; | ||
505 | __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; | ||
506 | |||
507 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
508 | 65534 | currentValues = _mm_loadu_ps(inputPtr); | |
509 | 65534 | inputPtr += 4; | |
510 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
511 | 65534 | compareResults = _mm_cmpgt_ps(currentValues, maxValues); | |
512 | 196602 | maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), | |
513 | _mm_andnot_ps(compareResults, maxValuesIndex)); | ||
514 | 196602 | maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), | |
515 | _mm_andnot_ps(compareResults, maxValues)); | ||
516 | } | ||
517 | |||
518 | // Reduce the 4 per-lane maxima to a single maximum; on ties keep the lowest index | ||
519 | _mm_store_ps(maxValuesBuffer, maxValues); | ||
520 | _mm_store_ps(maxIndexesBuffer, maxValuesIndex); | ||
521 | |||
522 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; number++) { |
523 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (maxValuesBuffer[number] > max) { |
524 | 3 | index = maxIndexesBuffer[number]; | |
525 | 3 | max = maxValuesBuffer[number]; | |
526 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (maxValuesBuffer[number] == max) { |
527 | ✗ | if (index > maxIndexesBuffer[number]) | |
528 | ✗ | index = maxIndexesBuffer[number]; | |
529 | } | ||
530 | } | ||
531 | |||
532 | 2 | number = quarterPoints * 4; | |
533 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
534 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (src0[number] > max) { |
535 | ✗ | index = number; | |
536 | ✗ | max = src0[number]; | |
537 | } | ||
538 | } | ||
539 | 2 | target[0] = (uint32_t)index; | |
540 | } | ||
541 | 2 | } | |
542 | |||
543 | #endif /*LV_HAVE_SSE*/ | ||
544 | |||
545 | #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/ | ||
546 |