| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2014, 2021 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32f_stddev_and_mean_32f_x2 | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * Computes the standard deviation and mean of the input buffer by means of | ||
| 16 | * Youngs and Cramer's Algorithm | ||
| 17 | * | ||
| 18 | * <b>Dispatcher Prototype</b> | ||
| 19 | * \code | ||
| 20 | * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* | ||
| 21 | * inputBuffer, unsigned int num_points) \endcode | ||
| 22 | * | ||
| 23 | * \b Inputs | ||
| 24 | * \li inputBuffer: The buffer of points. | ||
| 25 | * \li num_points The number of values in input buffer. | ||
| 26 | * | ||
| 27 | * \b Outputs | ||
| 28 | * \li stddev: The calculated standard deviation. | ||
| 29 | * \li mean: The mean of the input buffer. | ||
| 30 | * | ||
| 31 | * \b Example | ||
| 32 | * Generate random numbers with c++11's normal distribution and estimate the mean and | ||
| 33 | * standard deviation | ||
| 34 | * \code | ||
| 35 | * int N = 1000; | ||
| 36 | * unsigned int alignment = volk_get_alignment(); | ||
| 37 | * float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment); | ||
| 38 | * float* mean = (float*) volk_malloc(sizeof(float), alignment); | ||
| 39 | * float* stddev = (float*) volk_malloc(sizeof(float), alignment); | ||
| 40 | * | ||
| 41 | * // Use a normal generator with 0 mean, stddev 1000 | ||
| 42 | * std::default_random_engine generator; | ||
| 43 | * std::normal_distribution<float> distribution(0, 1000); | ||
| 44 | * | ||
| 45 | * for(unsigned int ii = 0; ii < N; ++ii) { | ||
| 46 | * rand_numbers[ii] = distribution(generator); | ||
| 47 | * } | ||
| 48 | * | ||
| 49 | * volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N); | ||
| 50 | * | ||
| 51 | * printf("std. dev. = %f\n", *stddev); | ||
| 52 | * printf("mean = %f\n", *mean); | ||
| 53 | * | ||
| 54 | * volk_free(rand_numbers); | ||
| 55 | * volk_free(mean); | ||
| 56 | * volk_free(stddev); | ||
| 57 | * \endcode | ||
| 58 | */ | ||
| 59 | |||
| 60 | #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H | ||
| 61 | #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H | ||
| 62 | |||
| 63 | #include <inttypes.h> | ||
| 64 | #include <math.h> | ||
| 65 | #include <volk/volk_common.h> | ||
| 66 | |||
| 67 | // Youngs and Cramer's Algorithm for calculating std and mean | ||
| 68 | // Using the methods discussed here: | ||
| 69 | // https://doi.org/10.1145/3221269.3223036 | ||
| 70 | #ifdef LV_HAVE_GENERIC | ||
| 71 | |||
| 72 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, | |
| 73 | float* mean, | ||
| 74 | const float* inputBuffer, | ||
| 75 | unsigned int num_points) | ||
| 76 | { | ||
| 77 | 2 | const float* in_ptr = inputBuffer; | |
| 78 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points == 0) { |
| 79 | ✗ | return; | |
| 80 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | } else if (num_points == 1) { |
| 81 | ✗ | *stddev = 0.f; | |
| 82 | ✗ | *mean = (*in_ptr); | |
| 83 | ✗ | return; | |
| 84 | } | ||
| 85 | |||
| 86 | float Sum[2]; | ||
| 87 | 2 | float SquareSum[2] = { 0.f, 0.f }; | |
| 88 | 2 | Sum[0] = (*in_ptr++); | |
| 89 | 2 | Sum[1] = (*in_ptr++); | |
| 90 | |||
| 91 | 2 | uint32_t half_points = num_points / 2; | |
| 92 | |||
| 93 |
2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.
|
131070 | for (uint32_t number = 1; number < half_points; number++) { |
| 94 | 131068 | float Val0 = (*in_ptr++); | |
| 95 | 131068 | float Val1 = (*in_ptr++); | |
| 96 | 131068 | float n = (float)number; | |
| 97 | 131068 | float n_plus_one = n + 1.f; | |
| 98 | 131068 | float r = 1.f / (n * n_plus_one); | |
| 99 | |||
| 100 | 131068 | Sum[0] += Val0; | |
| 101 | 131068 | Sum[1] += Val1; | |
| 102 | |||
| 103 | 131068 | SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2); | |
| 104 | 131068 | SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2); | |
| 105 | } | ||
| 106 | |||
| 107 | 2 | SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2); | |
| 108 | 2 | Sum[0] += Sum[1]; | |
| 109 | |||
| 110 | 2 | uint32_t points_done = half_points * 2; | |
| 111 | |||
| 112 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; points_done < num_points; points_done++) { |
| 113 | 2 | float Val = (*in_ptr++); | |
| 114 | 2 | float n = (float)points_done; | |
| 115 | 2 | float n_plus_one = n + 1.f; | |
| 116 | 2 | float r = 1.f / (n * n_plus_one); | |
| 117 | 2 | Sum[0] += Val; | |
| 118 | 2 | SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2); | |
| 119 | } | ||
| 120 | 2 | *stddev = sqrtf(SquareSum[0] / num_points); | |
| 121 | 2 | *mean = Sum[0] / num_points; | |
| 122 | } | ||
| 123 | #endif /* LV_HAVE_GENERIC */ | ||
| 124 | |||
| 125 | 88 | static inline float update_square_sum_1_val(const float SquareSum, | |
| 126 | const float Sum, | ||
| 127 | const uint32_t len, | ||
| 128 | const float val) | ||
| 129 | { | ||
| 130 | // Updates a sum of squares calculated over len values with the value val | ||
| 131 | 88 | float n = (float)len; | |
| 132 | 88 | float n_plus_one = n + 1.f; | |
| 133 | 176 | return SquareSum + | |
| 134 | 88 | 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum); | |
| 135 | } | ||
| 136 | |||
| 137 | 88 | static inline float add_square_sums(const float SquareSum0, | |
| 138 | const float Sum0, | ||
| 139 | const float SquareSum1, | ||
| 140 | const float Sum1, | ||
| 141 | const uint32_t len) | ||
| 142 | { | ||
| 143 | // Add two sums of squares calculated over the same number of values, len | ||
| 144 | 88 | float n = (float)len; | |
| 145 | 88 | return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1); | |
| 146 | } | ||
| 147 | |||
| 148 | 8 | static inline void accrue_result(float* PartialSquareSums, | |
| 149 | float* PartialSums, | ||
| 150 | const uint32_t NumberOfPartitions, | ||
| 151 | const uint32_t PartitionLen) | ||
| 152 | { | ||
| 153 | // Add all partial sums and square sums into the first element of the arrays | ||
| 154 | 8 | uint32_t accumulators = NumberOfPartitions; | |
| 155 | 8 | uint32_t stages = 0; | |
| 156 | 8 | uint32_t offset = 1; | |
| 157 | 8 | uint32_t partition_len = PartitionLen; | |
| 158 | |||
| 159 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
|
36 | while (accumulators >>= 1) { |
| 160 | 28 | stages++; | |
| 161 | } // Integer log2 | ||
| 162 | 8 | accumulators = NumberOfPartitions; | |
| 163 | |||
| 164 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
|
36 | for (uint32_t s = 0; s < stages; s++) { |
| 165 | 28 | accumulators /= 2; | |
| 166 | 28 | uint32_t idx = 0; | |
| 167 |
2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 28 times.
|
116 | for (uint32_t a = 0; a < accumulators; a++) { |
| 168 | 176 | PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx], | |
| 169 | 88 | PartialSums[idx], | |
| 170 | 88 | PartialSquareSums[idx + offset], | |
| 171 | 88 | PartialSums[idx + offset], | |
| 172 | partition_len); | ||
| 173 | 88 | PartialSums[idx] += PartialSums[idx + offset]; | |
| 174 | 88 | idx += 2 * offset; | |
| 175 | } | ||
| 176 | 28 | offset *= 2; | |
| 177 | 28 | partition_len *= 2; | |
| 178 | } | ||
| 179 | 8 | } | |
| 180 | |||
| 181 | #ifdef LV_HAVE_NEON | ||
| 182 | #include <arm_neon.h> | ||
| 183 | #include <volk/volk_neon_intrinsics.h> | ||
| 184 | |||
| 185 | static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev, | ||
| 186 | float* mean, | ||
| 187 | const float* inputBuffer, | ||
| 188 | unsigned int num_points) | ||
| 189 | { | ||
| 190 | if (num_points < 8) { | ||
| 191 | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | ||
| 192 | return; | ||
| 193 | } | ||
| 194 | |||
| 195 | const float* in_ptr = inputBuffer; | ||
| 196 | |||
| 197 | __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f }; | ||
| 198 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f }; | ||
| 199 | |||
| 200 | const uint32_t eigth_points = num_points / 8; | ||
| 201 | |||
| 202 | float32x4_t Sum0, Sum1; | ||
| 203 | |||
| 204 | Sum0 = vld1q_f32((const float32_t*)in_ptr); | ||
| 205 | in_ptr += 4; | ||
| 206 | __VOLK_PREFETCH(in_ptr + 4); | ||
| 207 | |||
| 208 | Sum1 = vld1q_f32((const float32_t*)in_ptr); | ||
| 209 | in_ptr += 4; | ||
| 210 | __VOLK_PREFETCH(in_ptr + 4); | ||
| 211 | |||
| 212 | float32x4_t SquareSum0 = { 0.f }; | ||
| 213 | float32x4_t SquareSum1 = { 0.f }; | ||
| 214 | |||
| 215 | float32x4_t Values0, Values1; | ||
| 216 | float32x4_t Aux0, Aux1; | ||
| 217 | float32x4_t Reciprocal; | ||
| 218 | |||
| 219 | for (uint32_t number = 1; number < eigth_points; number++) { | ||
| 220 | Values0 = vld1q_f32(in_ptr); | ||
| 221 | in_ptr += 4; | ||
| 222 | __VOLK_PREFETCH(in_ptr + 4); | ||
| 223 | |||
| 224 | Values1 = vld1q_f32(in_ptr); | ||
| 225 | in_ptr += 4; | ||
| 226 | __VOLK_PREFETCH(in_ptr + 4); | ||
| 227 | |||
| 228 | float n = (float)number; | ||
| 229 | float n_plus_one = n + 1.f; | ||
| 230 | Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one)); | ||
| 231 | |||
| 232 | Sum0 = vaddq_f32(Sum0, Values0); | ||
| 233 | Aux0 = vdupq_n_f32(n_plus_one); | ||
| 234 | SquareSum0 = | ||
| 235 | _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | ||
| 236 | |||
| 237 | Sum1 = vaddq_f32(Sum1, Values1); | ||
| 238 | Aux1 = vdupq_n_f32(n_plus_one); | ||
| 239 | SquareSum1 = | ||
| 240 | _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | ||
| 241 | } | ||
| 242 | |||
| 243 | vst1q_f32(&SumLocal[0], Sum0); | ||
| 244 | vst1q_f32(&SumLocal[4], Sum1); | ||
| 245 | vst1q_f32(&SquareSumLocal[0], SquareSum0); | ||
| 246 | vst1q_f32(&SquareSumLocal[4], SquareSum1); | ||
| 247 | |||
| 248 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | ||
| 249 | |||
| 250 | uint32_t points_done = eigth_points * 8; | ||
| 251 | |||
| 252 | for (; points_done < num_points; points_done++) { | ||
| 253 | float val = (*in_ptr++); | ||
| 254 | SumLocal[0] += val; | ||
| 255 | SquareSumLocal[0] = | ||
| 256 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | ||
| 257 | } | ||
| 258 | |||
| 259 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | ||
| 260 | *mean = SumLocal[0] / num_points; | ||
| 261 | } | ||
| 262 | #endif /* LV_HAVE_NEON */ | ||
| 263 | |||
| 264 | #ifdef LV_HAVE_SSE | ||
| 265 | #include <volk/volk_sse_intrinsics.h> | ||
| 266 | #include <xmmintrin.h> | ||
| 267 | |||
| 268 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev, | |
| 269 | float* mean, | ||
| 270 | const float* inputBuffer, | ||
| 271 | unsigned int num_points) | ||
| 272 | { | ||
| 273 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 8) { |
| 274 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
| 275 | ✗ | return; | |
| 276 | } | ||
| 277 | |||
| 278 | 2 | const float* in_ptr = inputBuffer; | |
| 279 | |||
| 280 | 2 | __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f }; | |
| 281 | 2 | __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f }; | |
| 282 | |||
| 283 | |||
| 284 | 2 | const uint32_t eigth_points = num_points / 8; | |
| 285 | |||
| 286 | 2 | __m128 Sum0 = _mm_loadu_ps(in_ptr); | |
| 287 | 2 | in_ptr += 4; | |
| 288 | 2 | __m128 Sum1 = _mm_loadu_ps(in_ptr); | |
| 289 | 2 | in_ptr += 4; | |
| 290 | 2 | __m128 SquareSum0 = _mm_setzero_ps(); | |
| 291 | 2 | __m128 SquareSum1 = _mm_setzero_ps(); | |
| 292 | __m128 Values0, Values1; | ||
| 293 | __m128 Aux0, Aux1; | ||
| 294 | __m128 Reciprocal; | ||
| 295 | |||
| 296 |
2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
|
32766 | for (uint32_t number = 1; number < eigth_points; number++) { |
| 297 | 32764 | Values0 = _mm_loadu_ps(in_ptr); | |
| 298 | 32764 | in_ptr += 4; | |
| 299 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
| 300 | |||
| 301 | 32764 | Values1 = _mm_loadu_ps(in_ptr); | |
| 302 | 32764 | in_ptr += 4; | |
| 303 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
| 304 | |||
| 305 | 32764 | float n = (float)number; | |
| 306 | 32764 | float n_plus_one = n + 1.f; | |
| 307 | 65528 | Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one)); | |
| 308 | |||
| 309 | 32764 | Sum0 = _mm_add_ps(Sum0, Values0); | |
| 310 | 32764 | Aux0 = _mm_set_ps1(n_plus_one); | |
| 311 | SquareSum0 = | ||
| 312 | 32764 | _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
| 313 | |||
| 314 | 32764 | Sum1 = _mm_add_ps(Sum1, Values1); | |
| 315 | 32764 | Aux1 = _mm_set_ps1(n_plus_one); | |
| 316 | SquareSum1 = | ||
| 317 | 32764 | _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
| 318 | } | ||
| 319 | |||
| 320 | _mm_store_ps(&SumLocal[0], Sum0); | ||
| 321 | _mm_store_ps(&SumLocal[4], Sum1); | ||
| 322 | _mm_store_ps(&SquareSumLocal[0], SquareSum0); | ||
| 323 | _mm_store_ps(&SquareSumLocal[4], SquareSum1); | ||
| 324 | |||
| 325 | 2 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | |
| 326 | |||
| 327 | 2 | uint32_t points_done = eigth_points * 8; | |
| 328 | |||
| 329 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; points_done < num_points; points_done++) { |
| 330 | 14 | float val = (*in_ptr++); | |
| 331 | 14 | SumLocal[0] += val; | |
| 332 | 14 | SquareSumLocal[0] = | |
| 333 | 14 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
| 334 | } | ||
| 335 | |||
| 336 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
| 337 | 2 | *mean = SumLocal[0] / num_points; | |
| 338 | } | ||
| 339 | #endif /* LV_HAVE_SSE */ | ||
| 340 | |||
| 341 | #ifdef LV_HAVE_AVX | ||
| 342 | #include <immintrin.h> | ||
| 343 | #include <volk/volk_avx_intrinsics.h> | ||
| 344 | |||
| 345 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, | |
| 346 | float* mean, | ||
| 347 | const float* inputBuffer, | ||
| 348 | unsigned int num_points) | ||
| 349 | { | ||
| 350 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 16) { |
| 351 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
| 352 | ✗ | return; | |
| 353 | } | ||
| 354 | |||
| 355 | 2 | const float* in_ptr = inputBuffer; | |
| 356 | |||
| 357 | 2 | __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f }; | |
| 358 | 2 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f }; | |
| 359 | |||
| 360 | 2 | const unsigned int sixteenth_points = num_points / 16; | |
| 361 | |||
| 362 | 2 | __m256 Sum0 = _mm256_loadu_ps(in_ptr); | |
| 363 | 2 | in_ptr += 8; | |
| 364 | 2 | __m256 Sum1 = _mm256_loadu_ps(in_ptr); | |
| 365 | 2 | in_ptr += 8; | |
| 366 | |||
| 367 | 2 | __m256 SquareSum0 = _mm256_setzero_ps(); | |
| 368 | 2 | __m256 SquareSum1 = _mm256_setzero_ps(); | |
| 369 | __m256 Values0, Values1; | ||
| 370 | __m256 Aux0, Aux1; | ||
| 371 | __m256 Reciprocal; | ||
| 372 | |||
| 373 |
2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
|
16382 | for (uint32_t number = 1; number < sixteenth_points; number++) { |
| 374 | 16380 | Values0 = _mm256_loadu_ps(in_ptr); | |
| 375 | 16380 | in_ptr += 8; | |
| 376 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
| 377 | |||
| 378 | 16380 | Values1 = _mm256_loadu_ps(in_ptr); | |
| 379 | 16380 | in_ptr += 8; | |
| 380 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
| 381 | |||
| 382 | 16380 | float n = (float)number; | |
| 383 | 16380 | float n_plus_one = n + 1.f; | |
| 384 | |||
| 385 | 32760 | Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one)); | |
| 386 | |||
| 387 | 16380 | Sum0 = _mm256_add_ps(Sum0, Values0); | |
| 388 | 16380 | Aux0 = _mm256_set1_ps(n_plus_one); | |
| 389 | SquareSum0 = | ||
| 390 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
| 391 | |||
| 392 | 16380 | Sum1 = _mm256_add_ps(Sum1, Values1); | |
| 393 | 16380 | Aux1 = _mm256_set1_ps(n_plus_one); | |
| 394 | SquareSum1 = | ||
| 395 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
| 396 | } | ||
| 397 | |||
| 398 | _mm256_store_ps(&SumLocal[0], Sum0); | ||
| 399 | _mm256_store_ps(&SumLocal[8], Sum1); | ||
| 400 | _mm256_store_ps(&SquareSumLocal[0], SquareSum0); | ||
| 401 | _mm256_store_ps(&SquareSumLocal[8], SquareSum1); | ||
| 402 | |||
| 403 | 2 | accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points); | |
| 404 | |||
| 405 | 2 | uint32_t points_done = sixteenth_points * 16; | |
| 406 | |||
| 407 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; points_done < num_points; points_done++) { |
| 408 | 30 | float val = (*in_ptr++); | |
| 409 | 30 | SumLocal[0] += val; | |
| 410 | 30 | SquareSumLocal[0] = | |
| 411 | 30 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
| 412 | } | ||
| 413 | |||
| 414 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
| 415 | 2 | *mean = SumLocal[0] / num_points; | |
| 416 | } | ||
| 417 | #endif /* LV_HAVE_AVX */ | ||
| 418 | |||
| 419 | #ifdef LV_HAVE_SSE | ||
| 420 | #include <xmmintrin.h> | ||
| 421 | |||
| 422 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, | |
| 423 | float* mean, | ||
| 424 | const float* inputBuffer, | ||
| 425 | unsigned int num_points) | ||
| 426 | { | ||
| 427 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 8) { |
| 428 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
| 429 | ✗ | return; | |
| 430 | } | ||
| 431 | |||
| 432 | 2 | const float* in_ptr = inputBuffer; | |
| 433 | |||
| 434 | 2 | __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f }; | |
| 435 | 2 | __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f }; | |
| 436 | |||
| 437 | |||
| 438 | 2 | const uint32_t eigth_points = num_points / 8; | |
| 439 | |||
| 440 | 2 | __m128 Sum0 = _mm_load_ps(in_ptr); | |
| 441 | 2 | in_ptr += 4; | |
| 442 | 2 | __m128 Sum1 = _mm_load_ps(in_ptr); | |
| 443 | 2 | in_ptr += 4; | |
| 444 | 2 | __m128 SquareSum0 = _mm_setzero_ps(); | |
| 445 | 2 | __m128 SquareSum1 = _mm_setzero_ps(); | |
| 446 | __m128 Values0, Values1; | ||
| 447 | __m128 Aux0, Aux1; | ||
| 448 | __m128 Reciprocal; | ||
| 449 | |||
| 450 |
2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.
|
32766 | for (uint32_t number = 1; number < eigth_points; number++) { |
| 451 | 32764 | Values0 = _mm_load_ps(in_ptr); | |
| 452 | 32764 | in_ptr += 4; | |
| 453 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
| 454 | |||
| 455 | 32764 | Values1 = _mm_load_ps(in_ptr); | |
| 456 | 32764 | in_ptr += 4; | |
| 457 | 32764 | __VOLK_PREFETCH(in_ptr + 4); | |
| 458 | |||
| 459 | 32764 | float n = (float)number; | |
| 460 | 32764 | float n_plus_one = n + 1.f; | |
| 461 | 65528 | Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one)); | |
| 462 | |||
| 463 | 32764 | Sum0 = _mm_add_ps(Sum0, Values0); | |
| 464 | 32764 | Aux0 = _mm_set_ps1(n_plus_one); | |
| 465 | SquareSum0 = | ||
| 466 | 32764 | _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
| 467 | |||
| 468 | 32764 | Sum1 = _mm_add_ps(Sum1, Values1); | |
| 469 | 32764 | Aux1 = _mm_set_ps1(n_plus_one); | |
| 470 | SquareSum1 = | ||
| 471 | 32764 | _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
| 472 | } | ||
| 473 | |||
| 474 | _mm_store_ps(&SumLocal[0], Sum0); | ||
| 475 | _mm_store_ps(&SumLocal[4], Sum1); | ||
| 476 | _mm_store_ps(&SquareSumLocal[0], SquareSum0); | ||
| 477 | _mm_store_ps(&SquareSumLocal[4], SquareSum1); | ||
| 478 | |||
| 479 | 2 | accrue_result(SquareSumLocal, SumLocal, 8, eigth_points); | |
| 480 | |||
| 481 | 2 | uint32_t points_done = eigth_points * 8; | |
| 482 | |||
| 483 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; points_done < num_points; points_done++) { |
| 484 | 14 | float val = (*in_ptr++); | |
| 485 | 14 | SumLocal[0] += val; | |
| 486 | 14 | SquareSumLocal[0] = | |
| 487 | 14 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
| 488 | } | ||
| 489 | |||
| 490 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
| 491 | 2 | *mean = SumLocal[0] / num_points; | |
| 492 | } | ||
| 493 | #endif /* LV_HAVE_SSE */ | ||
| 494 | |||
| 495 | #ifdef LV_HAVE_AVX | ||
| 496 | #include <immintrin.h> | ||
| 497 | |||
| 498 | 2 | static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, | |
| 499 | float* mean, | ||
| 500 | const float* inputBuffer, | ||
| 501 | unsigned int num_points) | ||
| 502 | { | ||
| 503 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (num_points < 16) { |
| 504 | ✗ | volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); | |
| 505 | ✗ | return; | |
| 506 | } | ||
| 507 | |||
| 508 | 2 | const float* in_ptr = inputBuffer; | |
| 509 | |||
| 510 | 2 | __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f }; | |
| 511 | 2 | __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f }; | |
| 512 | |||
| 513 | 2 | const unsigned int sixteenth_points = num_points / 16; | |
| 514 | |||
| 515 | 2 | __m256 Sum0 = _mm256_load_ps(in_ptr); | |
| 516 | 2 | in_ptr += 8; | |
| 517 | 2 | __m256 Sum1 = _mm256_load_ps(in_ptr); | |
| 518 | 2 | in_ptr += 8; | |
| 519 | |||
| 520 | 2 | __m256 SquareSum0 = _mm256_setzero_ps(); | |
| 521 | 2 | __m256 SquareSum1 = _mm256_setzero_ps(); | |
| 522 | __m256 Values0, Values1; | ||
| 523 | __m256 Aux0, Aux1; | ||
| 524 | __m256 Reciprocal; | ||
| 525 | |||
| 526 |
2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.
|
16382 | for (uint32_t number = 1; number < sixteenth_points; number++) { |
| 527 | 16380 | Values0 = _mm256_load_ps(in_ptr); | |
| 528 | 16380 | in_ptr += 8; | |
| 529 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
| 530 | |||
| 531 | 16380 | Values1 = _mm256_load_ps(in_ptr); | |
| 532 | 16380 | in_ptr += 8; | |
| 533 | 16380 | __VOLK_PREFETCH(in_ptr + 8); | |
| 534 | |||
| 535 | 16380 | float n = (float)number; | |
| 536 | 16380 | float n_plus_one = n + 1.f; | |
| 537 | |||
| 538 | 32760 | Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one)); | |
| 539 | |||
| 540 | 16380 | Sum0 = _mm256_add_ps(Sum0, Values0); | |
| 541 | 16380 | Aux0 = _mm256_set1_ps(n_plus_one); | |
| 542 | SquareSum0 = | ||
| 543 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0); | |
| 544 | |||
| 545 | 16380 | Sum1 = _mm256_add_ps(Sum1, Values1); | |
| 546 | 16380 | Aux1 = _mm256_set1_ps(n_plus_one); | |
| 547 | SquareSum1 = | ||
| 548 | 16380 | _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1); | |
| 549 | } | ||
| 550 | |||
| 551 | _mm256_store_ps(&SumLocal[0], Sum0); | ||
| 552 | _mm256_store_ps(&SumLocal[8], Sum1); | ||
| 553 | _mm256_store_ps(&SquareSumLocal[0], SquareSum0); | ||
| 554 | _mm256_store_ps(&SquareSumLocal[8], SquareSum1); | ||
| 555 | |||
| 556 | 2 | accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points); | |
| 557 | |||
| 558 | 2 | uint32_t points_done = sixteenth_points * 16; | |
| 559 | |||
| 560 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; points_done < num_points; points_done++) { |
| 561 | 30 | float val = (*in_ptr++); | |
| 562 | 30 | SumLocal[0] += val; | |
| 563 | 30 | SquareSumLocal[0] = | |
| 564 | 30 | update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val); | |
| 565 | } | ||
| 566 | |||
| 567 | 2 | *stddev = sqrtf(SquareSumLocal[0] / num_points); | |
| 568 | 2 | *mean = SumLocal[0] / num_points; | |
| 569 | } | ||
| 570 | #endif /* LV_HAVE_AVX */ | ||
| 571 | |||
| 572 | #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ | ||
| 573 |