Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014, 2021 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_stddev_and_mean_32f_x2 | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the standard deviation and mean of the input buffer by means of | ||
16 | * Youngs and Cramer's Algorithm | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* | ||
21 | * inputBuffer, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li inputBuffer: The buffer of points. | ||
25 | * \li num_points The number of values in input buffer. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li stddev: The calculated standard deviation. | ||
29 | * \li mean: The mean of the input buffer. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Generate random numbers with c++11's normal distribution and estimate the mean and | ||
33 | * standard deviation | ||
34 | * \code | ||
35 | * int N = 1000; | ||
36 | * unsigned int alignment = volk_get_alignment(); | ||
37 | * float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment); | ||
38 | * float* mean = (float*) volk_malloc(sizeof(float), alignment); | ||
39 | * float* stddev = (float*) volk_malloc(sizeof(float), alignment); | ||
40 | * | ||
41 | * // Use a normal generator with 0 mean, stddev 1000 | ||
42 | * std::default_random_engine generator; | ||
43 | * std::normal_distribution<float> distribution(0, 1000); | ||
44 | * | ||
45 | * for(unsigned int ii = 0; ii < N; ++ii) { | ||
46 | * rand_numbers[ii] = distribution(generator); | ||
47 | * } | ||
48 | * | ||
49 | * volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N); | ||
50 | * | ||
51 | * printf("std. dev. = %f\n", *stddev); | ||
52 | * printf("mean = %f\n", *mean); | ||
53 | * | ||
54 | * volk_free(rand_numbers); | ||
55 | * volk_free(mean); | ||
56 | * volk_free(stddev); | ||
57 | * \endcode | ||
58 | */ | ||
59 | |||
60 | #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H | ||
61 | #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H | ||
62 | |||
63 | #include <inttypes.h> | ||
64 | #include <math.h> | ||
65 | #include <volk/volk_common.h> | ||
66 | |||
67 | // Youngs and Cramer's Algorithm for calculating std and mean | ||
68 | // Using the methods discussed here: | ||
69 | // https://doi.org/10.1145/3221269.3223036 | ||
70 | #ifdef LV_HAVE_GENERIC | ||
71 | |||
72 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, | |
73 | float* mean, | ||
74 | const float* inputBuffer, | ||
75 | unsigned int num_points) | ||
76 | { | ||
77 | 2 | const float* in_ptr = inputBuffer; | |
78 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points == 0) { |
79 | ✗ | return; | |
80 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | } else if (num_points == 1) { |
81 | ✗ | *stddev = 0.f; | |
82 | ✗ | *mean = (*in_ptr); | |
83 | ✗ | return; | |
84 | } | ||
85 | |||
86 | float Sum[2]; | ||
87 | 2 | float SquareSum[2] = { 0.f, 0.f }; | |
88 | 2 | Sum[0] = (*in_ptr++); | |
89 | 2 | Sum[1] = (*in_ptr++); | |
90 | |||
91 | 2 | uint32_t half_points = num_points / 2; | |
92 | |||
93 |
2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.
|
131070 | for (uint32_t number = 1; number < half_points; number++) { |
94 | 131068 | float Val0 = (*in_ptr++); | |
95 | 131068 | float Val1 = (*in_ptr++); | |
96 | 131068 | float n = (float)number; | |
97 | 131068 | float n_plus_one = n + 1.f; | |
98 | 131068 | float r = 1.f / (n * n_plus_one); | |
99 | |||
100 | 131068 | Sum[0] += Val0; | |
101 | 131068 | Sum[1] += Val1; | |
102 | |||
103 | 131068 | SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2); | |
104 | 131068 | SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2); | |
105 | } | ||
106 | |||
107 | 2 | SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2); | |
108 | 2 | Sum[0] += Sum[1]; | |
109 | |||
110 | 2 | uint32_t points_done = half_points * 2; | |
111 | |||
112 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; points_done < num_points; points_done++) { |
113 | 2 | float Val = (*in_ptr++); | |
114 | 2 | float n = (float)points_done; | |
115 | 2 | float n_plus_one = n + 1.f; | |
116 | 2 | float r = 1.f / (n * n_plus_one); | |
117 | 2 | Sum[0] += Val; | |
118 | 2 | SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2); | |
119 | } | ||
120 | 2 | *stddev = sqrtf(SquareSum[0] / num_points); | |
121 | 2 | *mean = Sum[0] / num_points; | |
122 | } | ||
123 | #endif /* LV_HAVE_GENERIC */ | ||
124 | |||
125 | 88 | static inline float update_square_sum_1_val(const float SquareSum, | |
126 | const float Sum, | ||
127 | const uint32_t len, | ||
128 | const float val) | ||
129 | { | ||
130 | // Updates a sum of squares calculated over len values with the value val | ||
131 | 88 | float n = (float)len; | |
132 | 88 | float n_plus_one = n + 1.f; | |
133 | 176 | return SquareSum + | |
134 | 88 | 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum); | |
135 | } | ||
136 | |||
137 | 88 | static inline float add_square_sums(const float SquareSum0, | |
138 | const float Sum0, | ||
139 | const float SquareSum1, | ||
140 | const float Sum1, | ||
141 | const uint32_t len) | ||
142 | { | ||
143 | // Add two sums of squares calculated over the same number of values, len | ||
144 | 88 | float n = (float)len; | |
145 | 88 | return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1); | |
146 | } | ||
147 | |||
148 | 8 | static inline void accrue_result(float* PartialSquareSums, | |
149 | float* PartialSums, | ||
150 | const uint32_t NumberOfPartitions, | ||
151 | const uint32_t PartitionLen) | ||
152 | { | ||
153 | // Add all partial sums and square sums into the first element of the arrays | ||
154 | 8 | uint32_t accumulators = NumberOfPartitions; | |
155 | 8 | uint32_t stages = 0; | |
156 | 8 | uint32_t offset = 1; | |
157 | 8 | uint32_t partition_len = PartitionLen; | |
158 | |||
159 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
|
36 | while (accumulators >>= 1) { |
160 | 28 | stages++; | |
161 | } // Integer log2 | ||
162 | 8 | accumulators = NumberOfPartitions; | |
163 | |||
164 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
|
36 | for (uint32_t s = 0; s < stages; s++) { |
165 | 28 | accumulators /= 2; | |
166 | 28 | uint32_t idx = 0; | |
167 |
2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 28 times.
|
116 | for (uint32_t a = 0; a < accumulators; a++) { |
168 | 176 | PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx], | |
169 | 88 | PartialSums[idx], | |
170 | 88 | PartialSquareSums[idx + offset], | |
171 | 88 | PartialSums[idx + offset], | |
172 | partition_len); | ||
173 | 88 | PartialSums[idx] += PartialSums[idx + offset]; | |
174 | 88 | idx += 2 * offset; | |
175 | } | ||
176 | 28 | offset *= 2; | |
177 | 28 | partition_len *= 2; | |
178 | } | ||
179 | 8 | } | |
180 | |||
181 | #ifdef LV_HAVE_NEON | ||
182 | #include <arm_neon.h> | ||
183 | #include <volk/volk_neon_intrinsics.h> | ||
184 | |||
185 | static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev, | ||
186 | float* mean, | ||
187 | const float* inputBuffer, | ||
188 | unsigned int num_points) | ||
189 | { | ||
190 | if (num_points < 8) { | ||
191 | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | ||
192 | return; | ||
193 | } | ||
194 | |||
195 | const float* in_ptr = inputBuffer; | ||
196 | |||
197 | __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f }; | ||
198 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f }; | ||
199 | |||
200 | const uint32_t eigth_points = num_points / 8; | ||
201 | |||
202 | float32x4_t Sum0, Sum1; | ||
203 | |||
204 | Sum0 = vld1q_f32((const float32_t*)in_ptr); | ||
205 | in_ptr += 4; | ||
206 | __VOLK_PREFETCH(in_ptr + 4); | ||
207 | |||
208 | Sum1 = vld1q_f32((const float32_t*)in_ptr); | ||
209 | in_ptr += 4; | ||
210 | __VOLK_PREFETCH(in_ptr + 4); | ||
211 | |||
212 | float32x4_t SquareSum0 = { 0.f }; | ||
213 | float32x4_t SquareSum1 = { 0.f }; | ||
214 | |||
215 | float32x4_t Values0, Values1; | ||
216 | float32x4_t Aux0, Aux1; | ||
217 | float32x4_t Reciprocal; | ||
218 | |||
219 | for (uint32_t number = 1; number < eigth_points; number++) { | ||
220 | Values0 = vld1q_f32(in_ptr); | ||
221 | in_ptr += 4; | ||
222 | __VOLK_PREFETCH(in_ptr + 4); | ||
223 | |||
224 | Values1 = vld1q_f32(in_ptr); | ||
225 | in_ptr += 4; | ||
226 | __VOLK_PREFETCH(in_ptr + 4); | ||
227 | |||
228 | float n = (float)number; | ||
229 | float n_plus_one = n + 1.f; | ||
230 | Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one)); | ||
231 | |||
232 | Sum0 = vaddq_f32(Sum0, Values0); | ||
233 | Aux0 = vdupq_n_f32(n_plus_one); | ||
234 | SquareSum0 = | ||
235 | _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | ||
236 | |||
237 | Sum1 = vaddq_f32(Sum1, Values1); | ||
238 | Aux1 = vdupq_n_f32(n_plus_one); | ||
239 | SquareSum1 = | ||
240 | _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | ||
241 | } | ||
242 | |||
243 | vst1q_f32(&SumLocal[0], Sum0); | ||
244 | vst1q_f32(&SumLocal[4], Sum1); | ||
245 | vst1q_f32(&SquareSumLocal[0], SquareSum0); | ||
246 | vst1q_f32(&SquareSumLocal[4], SquareSum1); | ||
247 | |||
248 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | ||
249 | |||
250 | uint32_t points_done = eigth_points * 8; | ||
251 | |||
252 | for (; points_done < num_points; points_done++) { | ||
253 | float val = (*in_ptr++); | ||
254 | SumLocal[0] += val; | ||
255 | SquareSumLocal[0] = | ||
256 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | ||
257 | } | ||
258 | |||
259 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | ||
260 | *mean = SumLocal[0] / num_points; | ||
261 | } | ||
262 | #endif /* LV_HAVE_NEON */ | ||
263 | |||
264 | #ifdef LV_HAVE_SSE | ||
265 | #include <volk/volk_sse_intrinsics.h> | ||
266 | #include <xmmintrin.h> | ||
267 | |||
268 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev, | |
269 | float* mean, | ||
270 | const float* inputBuffer, | ||
271 | unsigned int num_points) | ||
272 | { | ||
273 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 8) { |
274 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
275 | ✗ | return; | |
276 | } | ||
277 | |||
278 | 2 | const float* in_ptr = inputBuffer; | |
279 | |||
280 | 2 | __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f }; | |
281 | 2 | __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f }; | |
282 | |||
283 | |||
284 | 2 | const uint32_t eigth_points = num_points / 8; | |
285 | |||
286 | 2 | __m128 Sum0 = _mm_loadu_ps(in_ptr); | |
287 | 2 | in_ptr += 4; | |
288 | 2 | __m128 Sum1 = _mm_loadu_ps(in_ptr); | |
289 | 2 | in_ptr += 4; | |
290 | 2 | __m128 SquareSum0 = _mm_setzero_ps(); | |
291 | 2 | __m128 SquareSum1 = _mm_setzero_ps(); | |
292 | __m128 Values0, Values1; | ||
293 | __m128 Aux0, Aux1; | ||
294 | __m128 Reciprocal; | ||
295 | |||
296 |
2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
|
32766 | for (uint32_t number = 1; number < eigth_points; number++) { |
297 | 32764 | Values0 = _mm_loadu_ps(in_ptr); | |
298 | 32764 | in_ptr += 4; | |
299 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
300 | |||
301 | 32764 | Values1 = _mm_loadu_ps(in_ptr); | |
302 | 32764 | in_ptr += 4; | |
303 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
304 | |||
305 | 32764 | float n = (float)number; | |
306 | 32764 | float n_plus_one = n + 1.f; | |
307 | 65528 | Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one)); | |
308 | |||
309 | 32764 | Sum0 = _mm_add_ps(Sum0, Values0); | |
310 | 32764 | Aux0 = _mm_set_ps1(n_plus_one); | |
311 | SquareSum0 = | ||
312 | 32764 | _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
313 | |||
314 | 32764 | Sum1 = _mm_add_ps(Sum1, Values1); | |
315 | 32764 | Aux1 = _mm_set_ps1(n_plus_one); | |
316 | SquareSum1 = | ||
317 | 32764 | _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
318 | } | ||
319 | |||
320 | _mm_store_ps(&SumLocal[0], Sum0); | ||
321 | _mm_store_ps(&SumLocal[4], Sum1); | ||
322 | _mm_store_ps(&SquareSumLocal[0], SquareSum0); | ||
323 | _mm_store_ps(&SquareSumLocal[4], SquareSum1); | ||
324 | |||
325 | 2 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | |
326 | |||
327 | 2 | uint32_t points_done = eigth_points * 8; | |
328 | |||
329 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; points_done < num_points; points_done++) { |
330 | 14 | float val = (*in_ptr++); | |
331 | 14 | SumLocal[0] += val; | |
332 | 14 | SquareSumLocal[0] = | |
333 | 14 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
334 | } | ||
335 | |||
336 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
337 | 2 | *mean = SumLocal[0] / num_points; | |
338 | } | ||
339 | #endif /* LV_HAVE_SSE */ | ||
340 | |||
341 | #ifdef LV_HAVE_AVX | ||
342 | #include <immintrin.h> | ||
343 | #include <volk/volk_avx_intrinsics.h> | ||
344 | |||
345 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, | |
346 | float* mean, | ||
347 | const float* inputBuffer, | ||
348 | unsigned int num_points) | ||
349 | { | ||
350 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 16) { |
351 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
352 | ✗ | return; | |
353 | } | ||
354 | |||
355 | 2 | const float* in_ptr = inputBuffer; | |
356 | |||
357 | 2 | __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f }; | |
358 | 2 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f }; | |
359 | |||
360 | 2 | const unsigned int sixteenth_points = num_points / 16; | |
361 | |||
362 | 2 | __m256 Sum0 = _mm256_loadu_ps(in_ptr); | |
363 | 2 | in_ptr += 8; | |
364 | 2 | __m256 Sum1 = _mm256_loadu_ps(in_ptr); | |
365 | 2 | in_ptr += 8; | |
366 | |||
367 | 2 | __m256 SquareSum0 = _mm256_setzero_ps(); | |
368 | 2 | __m256 SquareSum1 = _mm256_setzero_ps(); | |
369 | __m256 Values0, Values1; | ||
370 | __m256 Aux0, Aux1; | ||
371 | __m256 Reciprocal; | ||
372 | |||
373 |
2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
|
16382 | for (uint32_t number = 1; number < sixteenth_points; number++) { |
374 | 16380 | Values0 = _mm256_loadu_ps(in_ptr); | |
375 | 16380 | in_ptr += 8; | |
376 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
377 | |||
378 | 16380 | Values1 = _mm256_loadu_ps(in_ptr); | |
379 | 16380 | in_ptr += 8; | |
380 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
381 | |||
382 | 16380 | float n = (float)number; | |
383 | 16380 | float n_plus_one = n + 1.f; | |
384 | |||
385 | 32760 | Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one)); | |
386 | |||
387 | 16380 | Sum0 = _mm256_add_ps(Sum0, Values0); | |
388 | 16380 | Aux0 = _mm256_set1_ps(n_plus_one); | |
389 | SquareSum0 = | ||
390 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
391 | |||
392 | 16380 | Sum1 = _mm256_add_ps(Sum1, Values1); | |
393 | 16380 | Aux1 = _mm256_set1_ps(n_plus_one); | |
394 | SquareSum1 = | ||
395 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
396 | } | ||
397 | |||
398 | _mm256_store_ps(&SumLocal[0], Sum0); | ||
399 | _mm256_store_ps(&SumLocal[8], Sum1); | ||
400 | _mm256_store_ps(&SquareSumLocal[0], SquareSum0); | ||
401 | _mm256_store_ps(&SquareSumLocal[8], SquareSum1); | ||
402 | |||
403 | 2 | accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points); | |
404 | |||
405 | 2 | uint32_t points_done = sixteenth_points * 16; | |
406 | |||
407 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; points_done < num_points; points_done++) { |
408 | 30 | float val = (*in_ptr++); | |
409 | 30 | SumLocal[0] += val; | |
410 | 30 | SquareSumLocal[0] = | |
411 | 30 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
412 | } | ||
413 | |||
414 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
415 | 2 | *mean = SumLocal[0] / num_points; | |
416 | } | ||
417 | #endif /* LV_HAVE_AVX */ | ||
418 | |||
419 | #ifdef LV_HAVE_SSE | ||
420 | #include <xmmintrin.h> | ||
421 | |||
422 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, | |
423 | float* mean, | ||
424 | const float* inputBuffer, | ||
425 | unsigned int num_points) | ||
426 | { | ||
427 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 8) { |
428 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
429 | ✗ | return; | |
430 | } | ||
431 | |||
432 | 2 | const float* in_ptr = inputBuffer; | |
433 | |||
434 | 2 | __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f }; | |
435 | 2 | __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f }; | |
436 | |||
437 | |||
438 | 2 | const uint32_t eigth_points = num_points / 8; | |
439 | |||
440 | 2 | __m128 Sum0 = _mm_load_ps(in_ptr); | |
441 | 2 | in_ptr += 4; | |
442 | 2 | __m128 Sum1 = _mm_load_ps(in_ptr); | |
443 | 2 | in_ptr += 4; | |
444 | 2 | __m128 SquareSum0 = _mm_setzero_ps(); | |
445 | 2 | __m128 SquareSum1 = _mm_setzero_ps(); | |
446 | __m128 Values0, Values1; | ||
447 | __m128 Aux0, Aux1; | ||
448 | __m128 Reciprocal; | ||
449 | |||
450 |
2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
|
32766 | for (uint32_t number = 1; number < eigth_points; number++) { |
451 | 32764 | Values0 = _mm_load_ps(in_ptr); | |
452 | 32764 | in_ptr += 4; | |
453 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
454 | |||
455 | 32764 | Values1 = _mm_load_ps(in_ptr); | |
456 | 32764 | in_ptr += 4; | |
457 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
458 | |||
459 | 32764 | float n = (float)number; | |
460 | 32764 | float n_plus_one = n + 1.f; | |
461 | 65528 | Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one)); | |
462 | |||
463 | 32764 | Sum0 = _mm_add_ps(Sum0, Values0); | |
464 | 32764 | Aux0 = _mm_set_ps1(n_plus_one); | |
465 | SquareSum0 = | ||
466 | 32764 | _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
467 | |||
468 | 32764 | Sum1 = _mm_add_ps(Sum1, Values1); | |
469 | 32764 | Aux1 = _mm_set_ps1(n_plus_one); | |
470 | SquareSum1 = | ||
471 | 32764 | _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
472 | } | ||
473 | |||
474 | _mm_store_ps(&SumLocal[0], Sum0); | ||
475 | _mm_store_ps(&SumLocal[4], Sum1); | ||
476 | _mm_store_ps(&SquareSumLocal[0], SquareSum0); | ||
477 | _mm_store_ps(&SquareSumLocal[4], SquareSum1); | ||
478 | |||
479 | 2 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | |
480 | |||
481 | 2 | uint32_t points_done = eigth_points * 8; | |
482 | |||
483 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; points_done < num_points; points_done++) { |
484 | 14 | float val = (*in_ptr++); | |
485 | 14 | SumLocal[0] += val; | |
486 | 14 | SquareSumLocal[0] = | |
487 | 14 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
488 | } | ||
489 | |||
490 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
491 | 2 | *mean = SumLocal[0] / num_points; | |
492 | } | ||
493 | #endif /* LV_HAVE_SSE */ | ||
494 | |||
495 | #ifdef LV_HAVE_AVX | ||
496 | #include <immintrin.h> | ||
497 | |||
498 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, | |
499 | float* mean, | ||
500 | const float* inputBuffer, | ||
501 | unsigned int num_points) | ||
502 | { | ||
503 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 16) { |
504 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
505 | ✗ | return; | |
506 | } | ||
507 | |||
508 | 2 | const float* in_ptr = inputBuffer; | |
509 | |||
510 | 2 | __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f }; | |
511 | 2 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f }; | |
512 | |||
513 | 2 | const unsigned int sixteenth_points = num_points / 16; | |
514 | |||
515 | 2 | __m256 Sum0 = _mm256_load_ps(in_ptr); | |
516 | 2 | in_ptr += 8; | |
517 | 2 | __m256 Sum1 = _mm256_load_ps(in_ptr); | |
518 | 2 | in_ptr += 8; | |
519 | |||
520 | 2 | __m256 SquareSum0 = _mm256_setzero_ps(); | |
521 | 2 | __m256 SquareSum1 = _mm256_setzero_ps(); | |
522 | __m256 Values0, Values1; | ||
523 | __m256 Aux0, Aux1; | ||
524 | __m256 Reciprocal; | ||
525 | |||
526 |
2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
|
16382 | for (uint32_t number = 1; number < sixteenth_points; number++) { |
527 | 16380 | Values0 = _mm256_load_ps(in_ptr); | |
528 | 16380 | in_ptr += 8; | |
529 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
530 | |||
531 | 16380 | Values1 = _mm256_load_ps(in_ptr); | |
532 | 16380 | in_ptr += 8; | |
533 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
534 | |||
535 | 16380 | float n = (float)number; | |
536 | 16380 | float n_plus_one = n + 1.f; | |
537 | |||
538 | 32760 | Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one)); | |
539 | |||
540 | 16380 | Sum0 = _mm256_add_ps(Sum0, Values0); | |
541 | 16380 | Aux0 = _mm256_set1_ps(n_plus_one); | |
542 | SquareSum0 = | ||
543 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
544 | |||
545 | 16380 | Sum1 = _mm256_add_ps(Sum1, Values1); | |
546 | 16380 | Aux1 = _mm256_set1_ps(n_plus_one); | |
547 | SquareSum1 = | ||
548 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
549 | } | ||
550 | |||
551 | _mm256_store_ps(&SumLocal[0], Sum0); | ||
552 | _mm256_store_ps(&SumLocal[8], Sum1); | ||
553 | _mm256_store_ps(&SquareSumLocal[0], SquareSum0); | ||
554 | _mm256_store_ps(&SquareSumLocal[8], SquareSum1); | ||
555 | |||
556 | 2 | accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points); | |
557 | |||
558 | 2 | uint32_t points_done = sixteenth_points * 16; | |
559 | |||
560 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; points_done < num_points; points_done++) { |
561 | 30 | float val = (*in_ptr++); | |
562 | 30 | SumLocal[0] += val; | |
563 | 30 | SquareSumLocal[0] = | |
564 | 30 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
565 | } | ||
566 | |||
567 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
568 | 2 | *mean = SumLocal[0] / num_points; | |
569 | } | ||
570 | #endif /* LV_HAVE_AVX */ | ||
571 | |||
572 | #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ | ||
573 |