Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2021 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_index_min_32u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Returns Argmin_i x[i]. Finds and returns the index which contains the first minimum | ||
16 | * value in the given vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_index_min_32u(uint32_t* target, const float* source, uint32_t num_points) | ||
21 | * \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li source: The input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li target: The index of the first minimum value in the input buffer. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | * int N = 10; | ||
33 | * uint32_t alignment = volk_get_alignment(); | ||
34 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment); | ||
36 | * | ||
37 | * for(uint32_t ii = 0; ii < N; ++ii){ | ||
38 | * float x = (float)ii; | ||
39 | * // a parabola with a minimum at x=4 | ||
40 | * in[ii] = (x-4) * (x-4) - 5; | ||
41 | * } | ||
42 | * | ||
43 | * volk_32f_index_min_32u(out, in, N); | ||
44 | * | ||
45 | * printf("minimum is %1.2f at index %u\n", in[*out], *out); | ||
46 | * | ||
47 | * volk_free(in); | ||
48 | * volk_free(out); | ||
49 | * \endcode | ||
50 | */ | ||
51 | |||
52 | #ifndef INCLUDED_volk_32f_index_min_32u_a_H | ||
53 | #define INCLUDED_volk_32f_index_min_32u_a_H | ||
54 | |||
55 | #include <inttypes.h> | ||
56 | #include <stdio.h> | ||
57 | #include <volk/volk_common.h> | ||
58 | |||
59 | #ifdef LV_HAVE_SSE4_1 | ||
60 | #include <smmintrin.h> | ||
61 | |||
62 | 2 | static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target, | |
63 | const float* source, | ||
64 | uint32_t num_points) | ||
65 | { | ||
66 | 2 | const uint32_t quarterPoints = num_points / 4; | |
67 | |||
68 | 2 | float* inputPtr = (float*)source; | |
69 | |||
70 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
71 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
72 | |||
73 | 2 | float min = source[0]; | |
74 | 2 | float index = 0; | |
75 | 2 | __m128 minValues = _mm_set1_ps(min); | |
76 | 2 | __m128 minValuesIndex = _mm_setzero_ps(); | |
77 | __m128 compareResults; | ||
78 | __m128 currentValues; | ||
79 | |||
80 | __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; | ||
81 | __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; | ||
82 | |||
83 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (uint32_t number = 0; number < quarterPoints; number++) { |
84 | |||
85 | 65534 | currentValues = _mm_load_ps(inputPtr); | |
86 | 65534 | inputPtr += 4; | |
87 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
88 | |||
89 | 65534 | compareResults = _mm_cmplt_ps(currentValues, minValues); | |
90 | |||
91 | 65534 | minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); | |
92 | 65534 | minValues = _mm_blendv_ps(minValues, currentValues, compareResults); | |
93 | } | ||
94 | |||
95 | // Calculate the smallest value from the remaining 4 points | ||
96 | _mm_store_ps(minValuesBuffer, minValues); | ||
97 | _mm_store_ps(minIndexesBuffer, minValuesIndex); | ||
98 | |||
99 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (uint32_t number = 0; number < 4; number++) { |
100 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (minValuesBuffer[number] < min) { |
101 | 3 | index = minIndexesBuffer[number]; | |
102 | 3 | min = minValuesBuffer[number]; | |
103 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (minValuesBuffer[number] == min) { |
104 | ✗ | if (index > minIndexesBuffer[number]) | |
105 | ✗ | index = minIndexesBuffer[number]; | |
106 | } | ||
107 | } | ||
108 | |||
109 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (uint32_t number = quarterPoints * 4; number < num_points; number++) { |
110 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (source[number] < min) { |
111 | ✗ | index = number; | |
112 | ✗ | min = source[number]; | |
113 | } | ||
114 | } | ||
115 | 2 | target[0] = (uint32_t)index; | |
116 | 2 | } | |
117 | |||
118 | #endif /*LV_HAVE_SSE4_1*/ | ||
119 | |||
120 | |||
121 | #ifdef LV_HAVE_SSE | ||
122 | |||
123 | #include <xmmintrin.h> | ||
124 | |||
125 | static inline void | ||
126 | 2 | volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points) | |
127 | { | ||
128 | 2 | const uint32_t quarterPoints = num_points / 4; | |
129 | |||
130 | 2 | float* inputPtr = (float*)source; | |
131 | |||
132 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
133 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
134 | |||
135 | 2 | float min = source[0]; | |
136 | 2 | float index = 0; | |
137 | 2 | __m128 minValues = _mm_set1_ps(min); | |
138 | 2 | __m128 minValuesIndex = _mm_setzero_ps(); | |
139 | __m128 compareResults; | ||
140 | __m128 currentValues; | ||
141 | |||
142 | __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; | ||
143 | __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; | ||
144 | |||
145 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (uint32_t number = 0; number < quarterPoints; number++) { |
146 | |||
147 | 65534 | currentValues = _mm_load_ps(inputPtr); | |
148 | 65534 | inputPtr += 4; | |
149 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
150 | |||
151 | 65534 | compareResults = _mm_cmplt_ps(currentValues, minValues); | |
152 | |||
153 | 196602 | minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), | |
154 | _mm_andnot_ps(compareResults, minValuesIndex)); | ||
155 | |||
156 | 196602 | minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), | |
157 | _mm_andnot_ps(compareResults, minValues)); | ||
158 | } | ||
159 | |||
160 | // Calculate the smallest value from the remaining 4 points | ||
161 | _mm_store_ps(minValuesBuffer, minValues); | ||
162 | _mm_store_ps(minIndexesBuffer, minValuesIndex); | ||
163 | |||
164 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (uint32_t number = 0; number < 4; number++) { |
165 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (minValuesBuffer[number] < min) { |
166 | 3 | index = minIndexesBuffer[number]; | |
167 | 3 | min = minValuesBuffer[number]; | |
168 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (minValuesBuffer[number] == min) { |
169 | ✗ | if (index > minIndexesBuffer[number]) | |
170 | ✗ | index = minIndexesBuffer[number]; | |
171 | } | ||
172 | } | ||
173 | |||
174 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (uint32_t number = quarterPoints * 4; number < num_points; number++) { |
175 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (source[number] < min) { |
176 | ✗ | index = number; | |
177 | ✗ | min = source[number]; | |
178 | } | ||
179 | } | ||
180 | 2 | target[0] = (uint32_t)index; | |
181 | 2 | } | |
182 | |||
183 | #endif /*LV_HAVE_SSE*/ | ||
184 | |||
185 | |||
186 | #ifdef LV_HAVE_AVX | ||
187 | #include <immintrin.h> | ||
188 | |||
189 | static inline void | ||
190 | 2 | volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points) | |
191 | { | ||
192 | 2 | const uint32_t quarterPoints = num_points / 8; | |
193 | |||
194 | 2 | float* inputPtr = (float*)source; | |
195 | |||
196 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
197 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
198 | |||
199 | 2 | float min = source[0]; | |
200 | 2 | float index = 0; | |
201 | 2 | __m256 minValues = _mm256_set1_ps(min); | |
202 | 2 | __m256 minValuesIndex = _mm256_setzero_ps(); | |
203 | __m256 compareResults; | ||
204 | __m256 currentValues; | ||
205 | |||
206 | __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; | ||
207 | __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; | ||
208 | |||
209 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (uint32_t number = 0; number < quarterPoints; number++) { |
210 | 32766 | currentValues = _mm256_load_ps(inputPtr); | |
211 | 32766 | inputPtr += 8; | |
212 | 32766 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
213 | 32766 | compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); | |
214 | 32766 | minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); | |
215 | 32766 | minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); | |
216 | } | ||
217 | |||
218 | // Calculate the smallest value from the remaining 8 points | ||
219 | _mm256_store_ps(minValuesBuffer, minValues); | ||
220 | _mm256_store_ps(minIndexesBuffer, minValuesIndex); | ||
221 | |||
222 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (uint32_t number = 0; number < 8; number++) { |
223 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 13 times.
|
16 | if (minValuesBuffer[number] < min) { |
224 | 3 | index = minIndexesBuffer[number]; | |
225 | 3 | min = minValuesBuffer[number]; | |
226 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
|
13 | } else if (minValuesBuffer[number] == min) { |
227 | ✗ | if (index > minIndexesBuffer[number]) | |
228 | ✗ | index = minIndexesBuffer[number]; | |
229 | } | ||
230 | } | ||
231 | |||
232 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (uint32_t number = quarterPoints * 8; number < num_points; number++) { |
233 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (source[number] < min) { |
234 | ✗ | index = number; | |
235 | ✗ | min = source[number]; | |
236 | } | ||
237 | } | ||
238 | 2 | target[0] = (uint32_t)index; | |
239 | 2 | } | |
240 | |||
241 | #endif /*LV_HAVE_AVX*/ | ||
242 | |||
243 | |||
244 | #ifdef LV_HAVE_NEON | ||
245 | #include <arm_neon.h> | ||
246 | |||
/*!
 * \brief NEON argmin over floats.
 *
 * Processes four floats per iteration. The running index vector is kept as
 * floats for the increment and converted to unsigned integers for the
 * per-lane select; the four lanes are then reduced and the leftover
 * (num_points % 4) tail elements are scanned.
 *
 * NOTE(review): indexes are tracked as single-precision floats, which hold
 * every integer exactly only up to 2^24; results for longer vectors should be
 * verified.
 *
 * \param target     Receives the index of the first minimum in target[0].
 *                   Left untouched when \p num_points is 0.
 * \param source     Input vector of floats.
 * \param num_points Number of floats in \p source.
 */
static inline void
volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
{
    // An empty vector has no minimum; return before source[0] is read.
    if (num_points == 0) {
        return;
    }

    const uint32_t quarterPoints = num_points / 4;
    const float* inputPtr = source;

    const float32x4_t indexIncrementValues = vdupq_n_f32(4);
    // Lanes start at {-4, -3, -2, -1} so the first add yields {0, 1, 2, 3}.
    __VOLK_ATTR_ALIGNED(16)
    float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
    float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);

    float min = source[0];
    float index = 0;
    float32x4_t minValues = vdupq_n_f32(min);
    uint32x4_t minValuesIndex = vmovq_n_u32(0);

    __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];

    for (uint32_t number = 0; number < quarterPoints; number++) {
        const float32x4_t currentValues = vld1q_f32(inputPtr);
        inputPtr += 4;
        currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
        const uint32x4_t currentIndexes_u = vcvtq_u32_f32(currentIndexes);

        // Mask is set where the old minimum stays (current >= min): keep the
        // old index there and take the current index everywhere else.
        const uint32x4_t compareResults = vcgeq_f32(currentValues, minValues);
        minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
                                   vbicq_u32(currentIndexes_u, compareResults));
        minValues = vminq_f32(currentValues, minValues);
    }

    // Reduce the four per-lane minima to a single minimum and index.
    vst1q_f32(minValuesBuffer, minValues);
    vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
    for (uint32_t number = 0; number < 4; number++) {
        if (minValuesBuffer[number] < min) {
            index = minIndexesBuffer[number];
            min = minValuesBuffer[number];
        } else if (minValuesBuffer[number] == min) {
            // Compare against the stored copy rather than subscripting the
            // vector register, which relies on a compiler extension.
            // Equal minima in different lanes: the lower index wins.
            if (index > minIndexesBuffer[number]) {
                index = minIndexesBuffer[number];
            }
        }
    }

    // Scalar pass over the elements that did not fill a whole vector.
    for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
        if (source[number] < min) {
            index = number;
            min = source[number];
        }
    }
    target[0] = (uint32_t)index;
}
301 | |||
302 | #endif /*LV_HAVE_NEON*/ | ||
303 | |||
304 | |||
305 | #ifdef LV_HAVE_GENERIC | ||
306 | |||
/*!
 * \brief Portable scalar argmin: finds the index of the first minimum value.
 *
 * \param target     Receives the index of the first (lowest-index) minimum in
 *                   target[0]. Left untouched when \p num_points is 0.
 * \param source     Input vector of floats.
 * \param num_points Number of floats in \p source.
 */
static inline void
volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
    // An empty vector has no minimum; return before source[0] is read.
    if (num_points == 0) {
        return;
    }

    float min = source[0];
    uint32_t index = 0;

    // Strict '<' keeps the earliest index when the minimum value repeats.
    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            index = i;
            min = source[i];
        }
    }
    target[0] = index;
}
321 | |||
322 | #endif /*LV_HAVE_GENERIC*/ | ||
323 | |||
324 | |||
325 | #endif /*INCLUDED_volk_32f_index_min_32u_a_H*/ | ||
326 | |||
327 | |||
328 | #ifndef INCLUDED_volk_32f_index_min_32u_u_H | ||
329 | #define INCLUDED_volk_32f_index_min_32u_u_H | ||
330 | |||
331 | #include <inttypes.h> | ||
332 | #include <stdio.h> | ||
333 | #include <volk/volk_common.h> | ||
334 | |||
335 | |||
336 | #ifdef LV_HAVE_AVX | ||
337 | #include <immintrin.h> | ||
338 | |||
339 | static inline void | ||
340 | 2 | volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points) | |
341 | { | ||
342 | 2 | const uint32_t quarterPoints = num_points / 8; | |
343 | |||
344 | 2 | float* inputPtr = (float*)source; | |
345 | |||
346 | 2 | __m256 indexIncrementValues = _mm256_set1_ps(8); | |
347 | 2 | __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); | |
348 | |||
349 | 2 | float min = source[0]; | |
350 | 2 | float index = 0; | |
351 | 2 | __m256 minValues = _mm256_set1_ps(min); | |
352 | 2 | __m256 minValuesIndex = _mm256_setzero_ps(); | |
353 | __m256 compareResults; | ||
354 | __m256 currentValues; | ||
355 | |||
356 | __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; | ||
357 | __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; | ||
358 | |||
359 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (uint32_t number = 0; number < quarterPoints; number++) { |
360 | 32766 | currentValues = _mm256_loadu_ps(inputPtr); | |
361 | 32766 | inputPtr += 8; | |
362 | 32766 | currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); | |
363 | 32766 | compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); | |
364 | 32766 | minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); | |
365 | 32766 | minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); | |
366 | } | ||
367 | |||
368 | // Calculate the smalles value from the remaining 8 points | ||
369 | _mm256_store_ps(minValuesBuffer, minValues); | ||
370 | _mm256_store_ps(minIndexesBuffer, minValuesIndex); | ||
371 | |||
372 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (uint32_t number = 0; number < 8; number++) { |
373 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 13 times.
|
16 | if (minValuesBuffer[number] < min) { |
374 | 3 | index = minIndexesBuffer[number]; | |
375 | 3 | min = minValuesBuffer[number]; | |
376 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
|
13 | } else if (minValuesBuffer[number] == min) { |
377 | ✗ | if (index > minIndexesBuffer[number]) | |
378 | ✗ | index = minIndexesBuffer[number]; | |
379 | } | ||
380 | } | ||
381 | |||
382 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (uint32_t number = quarterPoints * 8; number < num_points; number++) { |
383 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.
|
14 | if (source[number] < min) { |
384 | ✗ | index = number; | |
385 | ✗ | min = source[number]; | |
386 | } | ||
387 | } | ||
388 | 2 | target[0] = (uint32_t)index; | |
389 | 2 | } | |
390 | |||
391 | #endif /*LV_HAVE_AVX*/ | ||
392 | |||
393 | |||
394 | #ifdef LV_HAVE_SSE4_1 | ||
395 | #include <smmintrin.h> | ||
396 | |||
397 | 2 | static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target, | |
398 | const float* source, | ||
399 | uint32_t num_points) | ||
400 | { | ||
401 | 2 | const uint32_t quarterPoints = num_points / 4; | |
402 | |||
403 | 2 | float* inputPtr = (float*)source; | |
404 | |||
405 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
406 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
407 | |||
408 | 2 | float min = source[0]; | |
409 | 2 | float index = 0; | |
410 | 2 | __m128 minValues = _mm_set1_ps(min); | |
411 | 2 | __m128 minValuesIndex = _mm_setzero_ps(); | |
412 | __m128 compareResults; | ||
413 | __m128 currentValues; | ||
414 | |||
415 | __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; | ||
416 | __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; | ||
417 | |||
418 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (uint32_t number = 0; number < quarterPoints; number++) { |
419 | 65534 | currentValues = _mm_loadu_ps(inputPtr); | |
420 | 65534 | inputPtr += 4; | |
421 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
422 | 65534 | compareResults = _mm_cmplt_ps(currentValues, minValues); | |
423 | 65534 | minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); | |
424 | 65534 | minValues = _mm_blendv_ps(minValues, currentValues, compareResults); | |
425 | } | ||
426 | |||
427 | // Calculate the smallest value from the remaining 4 points | ||
428 | _mm_store_ps(minValuesBuffer, minValues); | ||
429 | _mm_store_ps(minIndexesBuffer, minValuesIndex); | ||
430 | |||
431 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (uint32_t number = 0; number < 4; number++) { |
432 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (minValuesBuffer[number] < min) { |
433 | 3 | index = minIndexesBuffer[number]; | |
434 | 3 | min = minValuesBuffer[number]; | |
435 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (minValuesBuffer[number] == min) { |
436 | ✗ | if (index > minIndexesBuffer[number]) | |
437 | ✗ | index = minIndexesBuffer[number]; | |
438 | } | ||
439 | } | ||
440 | |||
441 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (uint32_t number = quarterPoints * 4; number < num_points; number++) { |
442 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (source[number] < min) { |
443 | ✗ | index = number; | |
444 | ✗ | min = source[number]; | |
445 | } | ||
446 | } | ||
447 | 2 | target[0] = (uint32_t)index; | |
448 | 2 | } | |
449 | |||
450 | #endif /*LV_HAVE_SSE4_1*/ | ||
451 | |||
452 | #ifdef LV_HAVE_SSE | ||
453 | #include <xmmintrin.h> | ||
454 | |||
455 | static inline void | ||
456 | 2 | volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points) | |
457 | { | ||
458 | 2 | const uint32_t quarterPoints = num_points / 4; | |
459 | |||
460 | 2 | float* inputPtr = (float*)source; | |
461 | |||
462 | 2 | __m128 indexIncrementValues = _mm_set1_ps(4); | |
463 | 2 | __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); | |
464 | |||
465 | 2 | float min = source[0]; | |
466 | 2 | float index = 0; | |
467 | 2 | __m128 minValues = _mm_set1_ps(min); | |
468 | 2 | __m128 minValuesIndex = _mm_setzero_ps(); | |
469 | __m128 compareResults; | ||
470 | __m128 currentValues; | ||
471 | |||
472 | __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; | ||
473 | __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; | ||
474 | |||
475 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (uint32_t number = 0; number < quarterPoints; number++) { |
476 | 65534 | currentValues = _mm_loadu_ps(inputPtr); | |
477 | 65534 | inputPtr += 4; | |
478 | 65534 | currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); | |
479 | 65534 | compareResults = _mm_cmplt_ps(currentValues, minValues); | |
480 | 196602 | minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), | |
481 | _mm_andnot_ps(compareResults, minValuesIndex)); | ||
482 | 196602 | minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), | |
483 | _mm_andnot_ps(compareResults, minValues)); | ||
484 | } | ||
485 | |||
486 | // Calculate the smallest value from the remaining 4 points | ||
487 | _mm_store_ps(minValuesBuffer, minValues); | ||
488 | _mm_store_ps(minIndexesBuffer, minValuesIndex); | ||
489 | |||
490 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (uint32_t number = 0; number < 4; number++) { |
491 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if (minValuesBuffer[number] < min) { |
492 | 3 | index = minIndexesBuffer[number]; | |
493 | 3 | min = minValuesBuffer[number]; | |
494 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | } else if (minValuesBuffer[number] == min) { |
495 | ✗ | if (index > minIndexesBuffer[number]) | |
496 | ✗ | index = minIndexesBuffer[number]; | |
497 | } | ||
498 | } | ||
499 | |||
500 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (uint32_t number = quarterPoints * 4; number < num_points; number++) { |
501 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (source[number] < min) { |
502 | ✗ | index = number; | |
503 | ✗ | min = source[number]; | |
504 | } | ||
505 | } | ||
506 | 2 | target[0] = (uint32_t)index; | |
507 | 2 | } | |
508 | |||
509 | #endif /*LV_HAVE_SSE*/ | ||
510 | |||
511 | #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/ | ||
512 |