GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 207 219 94.5%
Functions: 8 8 100.0%
Branches: 32 38 84.2%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_stddev_and_mean_32f_x2
12 *
13 * \b Overview
14 *
15 * Computes the standard deviation and mean of the input buffer by means of
16 * Youngs and Cramer's Algorithm
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
21 * inputBuffer, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li inputBuffer: The buffer of points.
25 * \li num_points The number of values in input buffer.
26 *
27 * \b Outputs
28 * \li stddev: The calculated standard deviation.
29 * \li mean: The mean of the input buffer.
30 *
31 * \b Example
32 * Generate random numbers with c++11's normal distribution and estimate the mean and
33 * standard deviation
34 * \code
35 * int N = 1000;
36 * unsigned int alignment = volk_get_alignment();
37 * float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment);
38 * float* mean = (float*) volk_malloc(sizeof(float), alignment);
39 * float* stddev = (float*) volk_malloc(sizeof(float), alignment);
40 *
41 * // Use a normal generator with 0 mean, stddev 1000
42 * std::default_random_engine generator;
43 * std::normal_distribution<float> distribution(0, 1000);
44 *
45 * for(unsigned int ii = 0; ii < N; ++ii) {
46 * rand_numbers[ii] = distribution(generator);
47 * }
48 *
49 * volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N);
50 *
51 * printf("std. dev. = %f\n", *stddev);
52 * printf("mean = %f\n", *mean);
53 *
54 * volk_free(rand_numbers);
55 * volk_free(mean);
56 * volk_free(stddev);
57 * \endcode
58 */
59
60 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
61 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
62
63 #include <inttypes.h>
64 #include <math.h>
65 #include <volk/volk_common.h>
66
67 // Youngs and Cramer's Algorithm for calculating std and mean
68 // Using the methods discussed here:
69 // https://doi.org/10.1145/3221269.3223036
70 #ifdef LV_HAVE_GENERIC
71
72 2 static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
73 float* mean,
74 const float* inputBuffer,
75 unsigned int num_points)
76 {
77 2 const float* in_ptr = inputBuffer;
78
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (num_points == 0) {
79 return;
80
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 } else if (num_points == 1) {
81 *stddev = 0.f;
82 *mean = (*in_ptr);
83 return;
84 }
85
86 float Sum[2];
87 2 float SquareSum[2] = { 0.f, 0.f };
88 2 Sum[0] = (*in_ptr++);
89 2 Sum[1] = (*in_ptr++);
90
91 2 uint32_t half_points = num_points / 2;
92
93
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.
131070 for (uint32_t number = 1; number < half_points; number++) {
94 131068 float Val0 = (*in_ptr++);
95 131068 float Val1 = (*in_ptr++);
96 131068 float n = (float)number;
97 131068 float n_plus_one = n + 1.f;
98 131068 float r = 1.f / (n * n_plus_one);
99
100 131068 Sum[0] += Val0;
101 131068 Sum[1] += Val1;
102
103 131068 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
104 131068 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
105 }
106
107 2 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
108 2 Sum[0] += Sum[1];
109
110 2 uint32_t points_done = half_points * 2;
111
112
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; points_done < num_points; points_done++) {
113 2 float Val = (*in_ptr++);
114 2 float n = (float)points_done;
115 2 float n_plus_one = n + 1.f;
116 2 float r = 1.f / (n * n_plus_one);
117 2 Sum[0] += Val;
118 2 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
119 }
120 2 *stddev = sqrtf(SquareSum[0] / num_points);
121 2 *mean = Sum[0] / num_points;
122 }
123 #endif /* LV_HAVE_GENERIC */
124
125 88 static inline float update_square_sum_1_val(const float SquareSum,
126 const float Sum,
127 const uint32_t len,
128 const float val)
129 {
130 // Updates a sum of squares calculated over len values with the value val
131 88 float n = (float)len;
132 88 float n_plus_one = n + 1.f;
133 176 return SquareSum +
134 88 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
135 }
136
137 88 static inline float add_square_sums(const float SquareSum0,
138 const float Sum0,
139 const float SquareSum1,
140 const float Sum1,
141 const uint32_t len)
142 {
143 // Add two sums of squares calculated over the same number of values, len
144 88 float n = (float)len;
145 88 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
146 }
147
148 8 static inline void accrue_result(float* PartialSquareSums,
149 float* PartialSums,
150 const uint32_t NumberOfPartitions,
151 const uint32_t PartitionLen)
152 {
153 // Add all partial sums and square sums into the first element of the arrays
154 8 uint32_t accumulators = NumberOfPartitions;
155 8 uint32_t stages = 0;
156 8 uint32_t offset = 1;
157 8 uint32_t partition_len = PartitionLen;
158
159
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
36 while (accumulators >>= 1) {
160 28 stages++;
161 } // Integer log2
162 8 accumulators = NumberOfPartitions;
163
164
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
36 for (uint32_t s = 0; s < stages; s++) {
165 28 accumulators /= 2;
166 28 uint32_t idx = 0;
167
2/2
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 28 times.
116 for (uint32_t a = 0; a < accumulators; a++) {
168 176 PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
169 88 PartialSums[idx],
170 88 PartialSquareSums[idx + offset],
171 88 PartialSums[idx + offset],
172 partition_len);
173 88 PartialSums[idx] += PartialSums[idx + offset];
174 88 idx += 2 * offset;
175 }
176 28 offset *= 2;
177 28 partition_len *= 2;
178 }
179 8 }
180
181 #ifdef LV_HAVE_NEON
182 #include <arm_neon.h>
183 #include <volk/volk_neon_intrinsics.h>
184
185 static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
186 float* mean,
187 const float* inputBuffer,
188 unsigned int num_points)
189 {
190 if (num_points < 8) {
191 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
192 return;
193 }
194
195 const float* in_ptr = inputBuffer;
196
197 __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
198 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
199
200 const uint32_t eigth_points = num_points / 8;
201
202 float32x4_t Sum0, Sum1;
203
204 Sum0 = vld1q_f32((const float32_t*)in_ptr);
205 in_ptr += 4;
206 __VOLK_PREFETCH(in_ptr + 4);
207
208 Sum1 = vld1q_f32((const float32_t*)in_ptr);
209 in_ptr += 4;
210 __VOLK_PREFETCH(in_ptr + 4);
211
212 float32x4_t SquareSum0 = { 0.f };
213 float32x4_t SquareSum1 = { 0.f };
214
215 float32x4_t Values0, Values1;
216 float32x4_t Aux0, Aux1;
217 float32x4_t Reciprocal;
218
219 for (uint32_t number = 1; number < eigth_points; number++) {
220 Values0 = vld1q_f32(in_ptr);
221 in_ptr += 4;
222 __VOLK_PREFETCH(in_ptr + 4);
223
224 Values1 = vld1q_f32(in_ptr);
225 in_ptr += 4;
226 __VOLK_PREFETCH(in_ptr + 4);
227
228 float n = (float)number;
229 float n_plus_one = n + 1.f;
230 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
231
232 Sum0 = vaddq_f32(Sum0, Values0);
233 Aux0 = vdupq_n_f32(n_plus_one);
234 SquareSum0 =
235 _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
236
237 Sum1 = vaddq_f32(Sum1, Values1);
238 Aux1 = vdupq_n_f32(n_plus_one);
239 SquareSum1 =
240 _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
241 }
242
243 vst1q_f32(&SumLocal[0], Sum0);
244 vst1q_f32(&SumLocal[4], Sum1);
245 vst1q_f32(&SquareSumLocal[0], SquareSum0);
246 vst1q_f32(&SquareSumLocal[4], SquareSum1);
247
248 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
249
250 uint32_t points_done = eigth_points * 8;
251
252 for (; points_done < num_points; points_done++) {
253 float val = (*in_ptr++);
254 SumLocal[0] += val;
255 SquareSumLocal[0] =
256 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
257 }
258
259 *stddev = sqrtf(SquareSumLocal[0] / num_points);
260 *mean = SumLocal[0] / num_points;
261 }
262 #endif /* LV_HAVE_NEON */
263
264 #ifdef LV_HAVE_SSE
265 #include <volk/volk_sse_intrinsics.h>
266 #include <xmmintrin.h>
267
268 2 static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
269 float* mean,
270 const float* inputBuffer,
271 unsigned int num_points)
272 {
273
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (num_points < 8) {
274 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
275 return;
276 }
277
278 2 const float* in_ptr = inputBuffer;
279
280 2 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
281 2 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
282
283
284 2 const uint32_t eigth_points = num_points / 8;
285
286 2 __m128 Sum0 = _mm_loadu_ps(in_ptr);
287 2 in_ptr += 4;
288 2 __m128 Sum1 = _mm_loadu_ps(in_ptr);
289 2 in_ptr += 4;
290 2 __m128 SquareSum0 = _mm_setzero_ps();
291 2 __m128 SquareSum1 = _mm_setzero_ps();
292 __m128 Values0, Values1;
293 __m128 Aux0, Aux1;
294 __m128 Reciprocal;
295
296
2/2
✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
32766 for (uint32_t number = 1; number < eigth_points; number++) {
297 32764 Values0 = _mm_loadu_ps(in_ptr);
298 32764 in_ptr += 4;
299 32764 __VOLK_PREFETCH(in_ptr + 4);
300
301 32764 Values1 = _mm_loadu_ps(in_ptr);
302 32764 in_ptr += 4;
303 32764 __VOLK_PREFETCH(in_ptr + 4);
304
305 32764 float n = (float)number;
306 32764 float n_plus_one = n + 1.f;
307 65528 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
308
309 32764 Sum0 = _mm_add_ps(Sum0, Values0);
310 32764 Aux0 = _mm_set_ps1(n_plus_one);
311 SquareSum0 =
312 32764 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
313
314 32764 Sum1 = _mm_add_ps(Sum1, Values1);
315 32764 Aux1 = _mm_set_ps1(n_plus_one);
316 SquareSum1 =
317 32764 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
318 }
319
320 _mm_store_ps(&SumLocal[0], Sum0);
321 _mm_store_ps(&SumLocal[4], Sum1);
322 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
323 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
324
325 2 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
326
327 2 uint32_t points_done = eigth_points * 8;
328
329
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; points_done < num_points; points_done++) {
330 14 float val = (*in_ptr++);
331 14 SumLocal[0] += val;
332 14 SquareSumLocal[0] =
333 14 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
334 }
335
336 2 *stddev = sqrtf(SquareSumLocal[0] / num_points);
337 2 *mean = SumLocal[0] / num_points;
338 }
339 #endif /* LV_HAVE_SSE */
340
341 #ifdef LV_HAVE_AVX
342 #include <immintrin.h>
343 #include <volk/volk_avx_intrinsics.h>
344
345 2 static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
346 float* mean,
347 const float* inputBuffer,
348 unsigned int num_points)
349 {
350
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (num_points < 16) {
351 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
352 return;
353 }
354
355 2 const float* in_ptr = inputBuffer;
356
357 2 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
358 2 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
359
360 2 const unsigned int sixteenth_points = num_points / 16;
361
362 2 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
363 2 in_ptr += 8;
364 2 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
365 2 in_ptr += 8;
366
367 2 __m256 SquareSum0 = _mm256_setzero_ps();
368 2 __m256 SquareSum1 = _mm256_setzero_ps();
369 __m256 Values0, Values1;
370 __m256 Aux0, Aux1;
371 __m256 Reciprocal;
372
373
2/2
✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
16382 for (uint32_t number = 1; number < sixteenth_points; number++) {
374 16380 Values0 = _mm256_loadu_ps(in_ptr);
375 16380 in_ptr += 8;
376 16380 __VOLK_PREFETCH(in_ptr + 8);
377
378 16380 Values1 = _mm256_loadu_ps(in_ptr);
379 16380 in_ptr += 8;
380 16380 __VOLK_PREFETCH(in_ptr + 8);
381
382 16380 float n = (float)number;
383 16380 float n_plus_one = n + 1.f;
384
385 32760 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
386
387 16380 Sum0 = _mm256_add_ps(Sum0, Values0);
388 16380 Aux0 = _mm256_set1_ps(n_plus_one);
389 SquareSum0 =
390 16380 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
391
392 16380 Sum1 = _mm256_add_ps(Sum1, Values1);
393 16380 Aux1 = _mm256_set1_ps(n_plus_one);
394 SquareSum1 =
395 16380 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
396 }
397
398 _mm256_store_ps(&SumLocal[0], Sum0);
399 _mm256_store_ps(&SumLocal[8], Sum1);
400 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
401 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
402
403 2 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
404
405 2 uint32_t points_done = sixteenth_points * 16;
406
407
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; points_done < num_points; points_done++) {
408 30 float val = (*in_ptr++);
409 30 SumLocal[0] += val;
410 30 SquareSumLocal[0] =
411 30 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
412 }
413
414 2 *stddev = sqrtf(SquareSumLocal[0] / num_points);
415 2 *mean = SumLocal[0] / num_points;
416 }
417 #endif /* LV_HAVE_AVX */
418
419 #ifdef LV_HAVE_SSE
420 #include <xmmintrin.h>
421
422 2 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
423 float* mean,
424 const float* inputBuffer,
425 unsigned int num_points)
426 {
427
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (num_points < 8) {
428 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
429 return;
430 }
431
432 2 const float* in_ptr = inputBuffer;
433
434 2 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
435 2 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
436
437
438 2 const uint32_t eigth_points = num_points / 8;
439
440 2 __m128 Sum0 = _mm_load_ps(in_ptr);
441 2 in_ptr += 4;
442 2 __m128 Sum1 = _mm_load_ps(in_ptr);
443 2 in_ptr += 4;
444 2 __m128 SquareSum0 = _mm_setzero_ps();
445 2 __m128 SquareSum1 = _mm_setzero_ps();
446 __m128 Values0, Values1;
447 __m128 Aux0, Aux1;
448 __m128 Reciprocal;
449
450
2/2
✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
32766 for (uint32_t number = 1; number < eigth_points; number++) {
451 32764 Values0 = _mm_load_ps(in_ptr);
452 32764 in_ptr += 4;
453 32764 __VOLK_PREFETCH(in_ptr + 4);
454
455 32764 Values1 = _mm_load_ps(in_ptr);
456 32764 in_ptr += 4;
457 32764 __VOLK_PREFETCH(in_ptr + 4);
458
459 32764 float n = (float)number;
460 32764 float n_plus_one = n + 1.f;
461 65528 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
462
463 32764 Sum0 = _mm_add_ps(Sum0, Values0);
464 32764 Aux0 = _mm_set_ps1(n_plus_one);
465 SquareSum0 =
466 32764 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
467
468 32764 Sum1 = _mm_add_ps(Sum1, Values1);
469 32764 Aux1 = _mm_set_ps1(n_plus_one);
470 SquareSum1 =
471 32764 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
472 }
473
474 _mm_store_ps(&SumLocal[0], Sum0);
475 _mm_store_ps(&SumLocal[4], Sum1);
476 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
477 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
478
479 2 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
480
481 2 uint32_t points_done = eigth_points * 8;
482
483
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; points_done < num_points; points_done++) {
484 14 float val = (*in_ptr++);
485 14 SumLocal[0] += val;
486 14 SquareSumLocal[0] =
487 14 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
488 }
489
490 2 *stddev = sqrtf(SquareSumLocal[0] / num_points);
491 2 *mean = SumLocal[0] / num_points;
492 }
493 #endif /* LV_HAVE_SSE */
494
495 #ifdef LV_HAVE_AVX
496 #include <immintrin.h>
497
498 2 static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
499 float* mean,
500 const float* inputBuffer,
501 unsigned int num_points)
502 {
503
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (num_points < 16) {
504 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
505 return;
506 }
507
508 2 const float* in_ptr = inputBuffer;
509
510 2 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
511 2 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
512
513 2 const unsigned int sixteenth_points = num_points / 16;
514
515 2 __m256 Sum0 = _mm256_load_ps(in_ptr);
516 2 in_ptr += 8;
517 2 __m256 Sum1 = _mm256_load_ps(in_ptr);
518 2 in_ptr += 8;
519
520 2 __m256 SquareSum0 = _mm256_setzero_ps();
521 2 __m256 SquareSum1 = _mm256_setzero_ps();
522 __m256 Values0, Values1;
523 __m256 Aux0, Aux1;
524 __m256 Reciprocal;
525
526
2/2
✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
16382 for (uint32_t number = 1; number < sixteenth_points; number++) {
527 16380 Values0 = _mm256_load_ps(in_ptr);
528 16380 in_ptr += 8;
529 16380 __VOLK_PREFETCH(in_ptr + 8);
530
531 16380 Values1 = _mm256_load_ps(in_ptr);
532 16380 in_ptr += 8;
533 16380 __VOLK_PREFETCH(in_ptr + 8);
534
535 16380 float n = (float)number;
536 16380 float n_plus_one = n + 1.f;
537
538 32760 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
539
540 16380 Sum0 = _mm256_add_ps(Sum0, Values0);
541 16380 Aux0 = _mm256_set1_ps(n_plus_one);
542 SquareSum0 =
543 16380 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
544
545 16380 Sum1 = _mm256_add_ps(Sum1, Values1);
546 16380 Aux1 = _mm256_set1_ps(n_plus_one);
547 SquareSum1 =
548 16380 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
549 }
550
551 _mm256_store_ps(&SumLocal[0], Sum0);
552 _mm256_store_ps(&SumLocal[8], Sum1);
553 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
554 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
555
556 2 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
557
558 2 uint32_t points_done = sixteenth_points * 16;
559
560
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; points_done < num_points; points_done++) {
561 30 float val = (*in_ptr++);
562 30 SumLocal[0] += val;
563 30 SquareSumLocal[0] =
564 30 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
565 }
566
567 2 *stddev = sqrtf(SquareSumLocal[0] / num_points);
568 2 *mean = SumLocal[0] / num_points;
569 }
570 #endif /* LV_HAVE_AVX */
571
572 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
573