GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	207	219	94.5%
Functions:	8	8	100.0%
Branches:	32	38	84.2%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_stddev_and_mean_32f_x2
    
       *
    
       * \b Overview
    
       *
    
       * Computes the standard deviation and mean of the input buffer by means of
    
       * Youngs and Cramer's Algorithm
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
    
       * inputBuffer, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li inputBuffer: The buffer of points.
    
       * \li num_points The number of values in input buffer.
    
       *
    
       * \b Outputs
    
       * \li stddev: The calculated standard deviation.
    
       * \li mean: The mean of the input buffer.
    
       *
    
       * \b Example
    
       * Generate random numbers with c++11's normal distribution and estimate the mean and
    
       * standard deviation
    
       * \code
    
       *   int N = 1000;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment);
    
       *   float* mean = (float*) volk_malloc(sizeof(float), alignment);
    
       *   float* stddev = (float*) volk_malloc(sizeof(float), alignment);
    
       *
    
       *   // Use a normal generator with 0 mean, stddev 1000
    
       *   std::default_random_engine generator;
    
       *   std::normal_distribution<float> distribution(0, 1000);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii) {
    
       *       rand_numbers[ii] =  distribution(generator);
    
       *   }
    
       *
    
       *   volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N);
    
       *
    
       *   printf("std. dev. = %f\n", *stddev);
    
       *   printf("mean = %f\n", *mean);
    
       *
    
       *   volk_free(rand_numbers);
    
       *   volk_free(mean);
    
       *   volk_free(stddev);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
    
      #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <volk/volk_common.h>
    
      // Youngs and Cramer's Algorithm for calculating std and mean
    
      //   Using the methods discussed here:
    
      //   https://doi.org/10.1145/3221269.3223036
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
    
                                                                 float* mean,
    
                                                                 const float* inputBuffer,
    
                                                                 unsigned int num_points)
    
      {
    
      2
          const float* in_ptr = inputBuffer;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          if (num_points == 0) {
    
      ✗
              return;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          } else if (num_points == 1) {
    
      ✗
              *stddev = 0.f;
    
      ✗
              *mean = (*in_ptr);
    
      ✗
              return;
    
          }
    
          float Sum[2];
    
      2
          float SquareSum[2] = { 0.f, 0.f };
    
      2
          Sum[0] = (*in_ptr++);
    
      2
          Sum[1] = (*in_ptr++);
    
      2
          uint32_t half_points = num_points / 2;
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 2 times.

      131070
          for (uint32_t number = 1; number < half_points; number++) {
    
      131068
              float Val0 = (*in_ptr++);
    
      131068
              float Val1 = (*in_ptr++);
    
      131068
              float n = (float)number;
    
      131068
              float n_plus_one = n + 1.f;
    
      131068
              float r = 1.f / (n * n_plus_one);
    
      131068
              Sum[0] += Val0;
    
      131068
              Sum[1] += Val1;
    
      131068
              SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
    
      131068
              SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
    
          }
    
      2
          SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
    
      2
          Sum[0] += Sum[1];
    
      2
          uint32_t points_done = half_points * 2;
    
        2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.

      4
          for (; points_done < num_points; points_done++) {
    
      2
              float Val = (*in_ptr++);
    
      2
              float n = (float)points_done;
    
      2
              float n_plus_one = n + 1.f;
    
      2
              float r = 1.f / (n * n_plus_one);
    
      2
              Sum[0] += Val;
    
      2
              SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
    
          }
    
      2
          *stddev = sqrtf(SquareSum[0] / num_points);
    
      2
          *mean = Sum[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      88
      static inline float update_square_sum_1_val(const float SquareSum,
    
                                                  const float Sum,
    
                                                  const uint32_t len,
    
                                                  const float val)
    
      {
    
          // Updates a sum of squares calculated over len values with the value val
    
      88
          float n = (float)len;
    
      88
          float n_plus_one = n + 1.f;
    
      176
          return SquareSum +
    
      88
                 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
    
      }
    
      88
      static inline float add_square_sums(const float SquareSum0,
    
                                          const float Sum0,
    
                                          const float SquareSum1,
    
                                          const float Sum1,
    
                                          const uint32_t len)
    
      {
    
          // Add two sums of squares calculated over the same number of values, len
    
      88
          float n = (float)len;
    
      88
          return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
    
      }
    
      8
      static inline void accrue_result(float* PartialSquareSums,
    
                                       float* PartialSums,
    
                                       const uint32_t NumberOfPartitions,
    
                                       const uint32_t PartitionLen)
    
      {
    
          // Add all partial sums and square sums into the first element of the arrays
    
      8
          uint32_t accumulators = NumberOfPartitions;
    
      8
          uint32_t stages = 0;
    
      8
          uint32_t offset = 1;
    
      8
          uint32_t partition_len = PartitionLen;
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.

      36
          while (accumulators >>= 1) {
    
      28
              stages++;
    
          } // Integer log2
    
      8
          accumulators = NumberOfPartitions;
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.

      36
          for (uint32_t s = 0; s < stages; s++) {
    
      28
              accumulators /= 2;
    
      28
              uint32_t idx = 0;
    
        2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 28 times.

      116
              for (uint32_t a = 0; a < accumulators; a++) {
    
      176
                  PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
    
      88
                                                           PartialSums[idx],
    
      88
                                                           PartialSquareSums[idx + offset],
    
      88
                                                           PartialSums[idx + offset],
    
                                                           partition_len);
    
      88
                  PartialSums[idx] += PartialSums[idx + offset];
    
      88
                  idx += 2 * offset;
    
              }
    
      28
              offset *= 2;
    
      28
              partition_len *= 2;
    
          }
    
      8
      }
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #include <volk/volk_neon_intrinsics.h>
    
      static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
    
                                                              float* mean,
    
                                                              const float* inputBuffer,
    
                                                              unsigned int num_points)
    
      {
    
          if (num_points < 8) {
    
              volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
    
              return;
    
          }
    
          const float* in_ptr = inputBuffer;
    
          __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
    
          __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
    
          const uint32_t eigth_points = num_points / 8;
    
          float32x4_t Sum0, Sum1;
    
          Sum0 = vld1q_f32((const float32_t*)in_ptr);
    
          in_ptr += 4;
    
          __VOLK_PREFETCH(in_ptr + 4);
    
          Sum1 = vld1q_f32((const float32_t*)in_ptr);
    
          in_ptr += 4;
    
          __VOLK_PREFETCH(in_ptr + 4);
    
          float32x4_t SquareSum0 = { 0.f };
    
          float32x4_t SquareSum1 = { 0.f };
    
          float32x4_t Values0, Values1;
    
          float32x4_t Aux0, Aux1;
    
          float32x4_t Reciprocal;
    
          for (uint32_t number = 1; number < eigth_points; number++) {
    
              Values0 = vld1q_f32(in_ptr);
    
              in_ptr += 4;
    
              __VOLK_PREFETCH(in_ptr + 4);
    
              Values1 = vld1q_f32(in_ptr);
    
              in_ptr += 4;
    
              __VOLK_PREFETCH(in_ptr + 4);
    
              float n = (float)number;
    
              float n_plus_one = n + 1.f;
    
              Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
    
              Sum0 = vaddq_f32(Sum0, Values0);
    
              Aux0 = vdupq_n_f32(n_plus_one);
    
              SquareSum0 =
    
                  _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
    
              Sum1 = vaddq_f32(Sum1, Values1);
    
              Aux1 = vdupq_n_f32(n_plus_one);
    
              SquareSum1 =
    
                  _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
    
          }
    
          vst1q_f32(&SumLocal[0], Sum0);
    
          vst1q_f32(&SumLocal[4], Sum1);
    
          vst1q_f32(&SquareSumLocal[0], SquareSum0);
    
          vst1q_f32(&SquareSumLocal[4], SquareSum1);
    
          accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
    
          uint32_t points_done = eigth_points * 8;
    
          for (; points_done < num_points; points_done++) {
    
              float val = (*in_ptr++);
    
              SumLocal[0] += val;
    
              SquareSumLocal[0] =
    
                  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
    
          }
    
          *stddev = sqrtf(SquareSumLocal[0] / num_points);
    
          *mean = SumLocal[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_SSE
    
      #include <volk/volk_sse_intrinsics.h>
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
    
                                                               float* mean,
    
                                                               const float* inputBuffer,
    
                                                               unsigned int num_points)
    
      {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          if (num_points < 8) {
    
      ✗
              volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
    
      ✗
              return;
    
          }
    
      2
          const float* in_ptr = inputBuffer;
    
      2
          __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
    
      2
          __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
    
      2
          const uint32_t eigth_points = num_points / 8;
    
      2
          __m128 Sum0 = _mm_loadu_ps(in_ptr);
    
      2
          in_ptr += 4;
    
      2
          __m128 Sum1 = _mm_loadu_ps(in_ptr);
    
      2
          in_ptr += 4;
    
      2
          __m128 SquareSum0 = _mm_setzero_ps();
    
      2
          __m128 SquareSum1 = _mm_setzero_ps();
    
          __m128 Values0, Values1;
    
          __m128 Aux0, Aux1;
    
          __m128 Reciprocal;
    
        2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.

      32766
          for (uint32_t number = 1; number < eigth_points; number++) {
    
      32764
              Values0 = _mm_loadu_ps(in_ptr);
    
      32764
              in_ptr += 4;
    
      32764
              __VOLK_PREFETCH(in_ptr + 4);
    
      32764
              Values1 = _mm_loadu_ps(in_ptr);
    
      32764
              in_ptr += 4;
    
      32764
              __VOLK_PREFETCH(in_ptr + 4);
    
      32764
              float n = (float)number;
    
      32764
              float n_plus_one = n + 1.f;
    
      65528
              Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
    
      32764
              Sum0 = _mm_add_ps(Sum0, Values0);
    
      32764
              Aux0 = _mm_set_ps1(n_plus_one);
    
              SquareSum0 =
    
      32764
                  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
    
      32764
              Sum1 = _mm_add_ps(Sum1, Values1);
    
      32764
              Aux1 = _mm_set_ps1(n_plus_one);
    
              SquareSum1 =
    
      32764
                  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
    
          }
    
          _mm_store_ps(&SumLocal[0], Sum0);
    
          _mm_store_ps(&SumLocal[4], Sum1);
    
          _mm_store_ps(&SquareSumLocal[0], SquareSum0);
    
          _mm_store_ps(&SquareSumLocal[4], SquareSum1);
    
      2
          accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
    
      2
          uint32_t points_done = eigth_points * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; points_done < num_points; points_done++) {
    
      14
              float val = (*in_ptr++);
    
      14
              SumLocal[0] += val;
    
      14
              SquareSumLocal[0] =
    
      14
                  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
    
          }
    
      2
          *stddev = sqrtf(SquareSumLocal[0] / num_points);
    
      2
          *mean = SumLocal[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
    
                                                               float* mean,
    
                                                               const float* inputBuffer,
    
                                                               unsigned int num_points)
    
      {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          if (num_points < 16) {
    
      ✗
              volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
    
      ✗
              return;
    
          }
    
      2
          const float* in_ptr = inputBuffer;
    
      2
          __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
    
      2
          __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
    
      2
          const unsigned int sixteenth_points = num_points / 16;
    
      2
          __m256 Sum0 = _mm256_loadu_ps(in_ptr);
    
      2
          in_ptr += 8;
    
      2
          __m256 Sum1 = _mm256_loadu_ps(in_ptr);
    
      2
          in_ptr += 8;
    
      2
          __m256 SquareSum0 = _mm256_setzero_ps();
    
      2
          __m256 SquareSum1 = _mm256_setzero_ps();
    
          __m256 Values0, Values1;
    
          __m256 Aux0, Aux1;
    
          __m256 Reciprocal;
    
        2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.

      16382
          for (uint32_t number = 1; number < sixteenth_points; number++) {
    
      16380
              Values0 = _mm256_loadu_ps(in_ptr);
    
      16380
              in_ptr += 8;
    
      16380
              __VOLK_PREFETCH(in_ptr + 8);
    
      16380
              Values1 = _mm256_loadu_ps(in_ptr);
    
      16380
              in_ptr += 8;
    
      16380
              __VOLK_PREFETCH(in_ptr + 8);
    
      16380
              float n = (float)number;
    
      16380
              float n_plus_one = n + 1.f;
    
      32760
              Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
    
      16380
              Sum0 = _mm256_add_ps(Sum0, Values0);
    
      16380
              Aux0 = _mm256_set1_ps(n_plus_one);
    
              SquareSum0 =
    
      16380
                  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
    
      16380
              Sum1 = _mm256_add_ps(Sum1, Values1);
    
      16380
              Aux1 = _mm256_set1_ps(n_plus_one);
    
              SquareSum1 =
    
      16380
                  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
    
          }
    
          _mm256_store_ps(&SumLocal[0], Sum0);
    
          _mm256_store_ps(&SumLocal[8], Sum1);
    
          _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
    
          _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
    
      2
          accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
    
      2
          uint32_t points_done = sixteenth_points * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; points_done < num_points; points_done++) {
    
      30
              float val = (*in_ptr++);
    
      30
              SumLocal[0] += val;
    
      30
              SquareSumLocal[0] =
    
      30
                  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
    
          }
    
      2
          *stddev = sqrtf(SquareSumLocal[0] / num_points);
    
      2
          *mean = SumLocal[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
    
                                                               float* mean,
    
                                                               const float* inputBuffer,
    
                                                               unsigned int num_points)
    
      {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          if (num_points < 8) {
    
      ✗
              volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
    
      ✗
              return;
    
          }
    
      2
          const float* in_ptr = inputBuffer;
    
      2
          __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
    
      2
          __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
    
      2
          const uint32_t eigth_points = num_points / 8;
    
      2
          __m128 Sum0 = _mm_load_ps(in_ptr);
    
      2
          in_ptr += 4;
    
      2
          __m128 Sum1 = _mm_load_ps(in_ptr);
    
      2
          in_ptr += 4;
    
      2
          __m128 SquareSum0 = _mm_setzero_ps();
    
      2
          __m128 SquareSum1 = _mm_setzero_ps();
    
          __m128 Values0, Values1;
    
          __m128 Aux0, Aux1;
    
          __m128 Reciprocal;
    
        2/2✓ Branch 0 taken 32764 times.
✓ Branch 1 taken 2 times.

      32766
          for (uint32_t number = 1; number < eigth_points; number++) {
    
      32764
              Values0 = _mm_load_ps(in_ptr);
    
      32764
              in_ptr += 4;
    
      32764
              __VOLK_PREFETCH(in_ptr + 4);
    
      32764
              Values1 = _mm_load_ps(in_ptr);
    
      32764
              in_ptr += 4;
    
      32764
              __VOLK_PREFETCH(in_ptr + 4);
    
      32764
              float n = (float)number;
    
      32764
              float n_plus_one = n + 1.f;
    
      65528
              Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
    
      32764
              Sum0 = _mm_add_ps(Sum0, Values0);
    
      32764
              Aux0 = _mm_set_ps1(n_plus_one);
    
              SquareSum0 =
    
      32764
                  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
    
      32764
              Sum1 = _mm_add_ps(Sum1, Values1);
    
      32764
              Aux1 = _mm_set_ps1(n_plus_one);
    
              SquareSum1 =
    
      32764
                  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
    
          }
    
          _mm_store_ps(&SumLocal[0], Sum0);
    
          _mm_store_ps(&SumLocal[4], Sum1);
    
          _mm_store_ps(&SquareSumLocal[0], SquareSum0);
    
          _mm_store_ps(&SquareSumLocal[4], SquareSum1);
    
      2
          accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
    
      2
          uint32_t points_done = eigth_points * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; points_done < num_points; points_done++) {
    
      14
              float val = (*in_ptr++);
    
      14
              SumLocal[0] += val;
    
      14
              SquareSumLocal[0] =
    
      14
                  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
    
          }
    
      2
          *stddev = sqrtf(SquareSumLocal[0] / num_points);
    
      2
          *mean = SumLocal[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
    
                                                               float* mean,
    
                                                               const float* inputBuffer,
    
                                                               unsigned int num_points)
    
      {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          if (num_points < 16) {
    
      ✗
              volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
    
      ✗
              return;
    
          }
    
      2
          const float* in_ptr = inputBuffer;
    
      2
          __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
    
      2
          __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
    
      2
          const unsigned int sixteenth_points = num_points / 16;
    
      2
          __m256 Sum0 = _mm256_load_ps(in_ptr);
    
      2
          in_ptr += 8;
    
      2
          __m256 Sum1 = _mm256_load_ps(in_ptr);
    
      2
          in_ptr += 8;
    
      2
          __m256 SquareSum0 = _mm256_setzero_ps();
    
      2
          __m256 SquareSum1 = _mm256_setzero_ps();
    
          __m256 Values0, Values1;
    
          __m256 Aux0, Aux1;
    
          __m256 Reciprocal;
    
        2/2✓ Branch 0 taken 16380 times.
✓ Branch 1 taken 2 times.

      16382
          for (uint32_t number = 1; number < sixteenth_points; number++) {
    
      16380
              Values0 = _mm256_load_ps(in_ptr);
    
      16380
              in_ptr += 8;
    
      16380
              __VOLK_PREFETCH(in_ptr + 8);
    
      16380
              Values1 = _mm256_load_ps(in_ptr);
    
      16380
              in_ptr += 8;
    
      16380
              __VOLK_PREFETCH(in_ptr + 8);
    
      16380
              float n = (float)number;
    
      16380
              float n_plus_one = n + 1.f;
    
      32760
              Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
    
      16380
              Sum0 = _mm256_add_ps(Sum0, Values0);
    
      16380
              Aux0 = _mm256_set1_ps(n_plus_one);
    
              SquareSum0 =
    
      16380
                  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
    
      16380
              Sum1 = _mm256_add_ps(Sum1, Values1);
    
      16380
              Aux1 = _mm256_set1_ps(n_plus_one);
    
              SquareSum1 =
    
      16380
                  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
    
          }
    
          _mm256_store_ps(&SumLocal[0], Sum0);
    
          _mm256_store_ps(&SumLocal[8], Sum1);
    
          _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
    
          _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
    
      2
          accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
    
      2
          uint32_t points_done = sixteenth_points * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; points_done < num_points; points_done++) {
    
      30
              float val = (*in_ptr++);
    
      30
              SumLocal[0] += val;
    
      30
              SquareSumLocal[0] =
    
      30
                  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
    
          }
    
      2
          *stddev = sqrtf(SquareSumLocal[0] / num_points);
    
      2
          *mean = SumLocal[0] / num_points;
    
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_stddev_and_mean_32f_x2
12			*
13			* \b Overview
14			*
15			* Computes the standard deviation and mean of the input buffer by means of
16			* Youngs and Cramer's Algorithm
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
21			* inputBuffer, unsigned int num_points) \endcode
22			*
23			* \b Inputs
24			* \li inputBuffer: The buffer of points.
25			* \li num_points The number of values in input buffer.
26			*
27			* \b Outputs
28			* \li stddev: The calculated standard deviation.
29			* \li mean: The mean of the input buffer.
30			*
31			* \b Example
32			* Generate random numbers with c++11's normal distribution and estimate the mean and
33			* standard deviation
34			* \code
35			* int N = 1000;
36			* unsigned int alignment = volk_get_alignment();
37			* float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment);
38			* float* mean = (float*) volk_malloc(sizeof(float), alignment);
39			* float* stddev = (float*) volk_malloc(sizeof(float), alignment);
40			*
41			* // Use a normal generator with 0 mean, stddev 1000
42			* std::default_random_engine generator;
43			* std::normal_distribution<float> distribution(0, 1000);
44			*
45			* for(unsigned int ii = 0; ii < N; ++ii) {
46			* rand_numbers[ii] = distribution(generator);
47			* }
48			*
49			* volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N);
50			*
51			* printf("std. dev. = %f\n", *stddev);
52			* printf("mean = %f\n", *mean);
53			*
54			* volk_free(rand_numbers);
55			* volk_free(mean);
56			* volk_free(stddev);
57			* \endcode
58			*/
59
60			#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
61			#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
62
63			#include <inttypes.h>
64			#include <math.h>
65			#include <volk/volk_common.h>
66
67			// Youngs and Cramer's Algorithm for calculating std and mean
68			// Using the methods discussed here:
69			// https://doi.org/10.1145/3221269.3223036
70			#ifdef LV_HAVE_GENERIC
71
72		2	static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
73			float* mean,
74			const float* inputBuffer,
75			unsigned int num_points)
76			{
77		2	const float* in_ptr = inputBuffer;
78	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	if (num_points == 0) {
79		✗	return;
80	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	} else if (num_points == 1) {
81		✗	*stddev = 0.f;
82		✗	*mean = (*in_ptr);
83		✗	return;
84			}
85
86			float Sum[2];
87		2	float SquareSum[2] = { 0.f, 0.f };
88		2	Sum[0] = (*in_ptr++);
89		2	Sum[1] = (*in_ptr++);
90
91		2	uint32_t half_points = num_points / 2;
92
93	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 2 times.	131070	for (uint32_t number = 1; number < half_points; number++) {
94		131068	float Val0 = (*in_ptr++);
95		131068	float Val1 = (*in_ptr++);
96		131068	float n = (float)number;
97		131068	float n_plus_one = n + 1.f;
98		131068	float r = 1.f / (n * n_plus_one);
99
100		131068	Sum[0] += Val0;
101		131068	Sum[1] += Val1;
102
103		131068	SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
104		131068	SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
105			}
106
107		2	SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
108		2	Sum[0] += Sum[1];
109
110		2	uint32_t points_done = half_points * 2;
111
112	2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times.	4	for (; points_done < num_points; points_done++) {
113		2	float Val = (*in_ptr++);
114		2	float n = (float)points_done;
115		2	float n_plus_one = n + 1.f;
116		2	float r = 1.f / (n * n_plus_one);
117		2	Sum[0] += Val;
118		2	SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
119			}
120		2	*stddev = sqrtf(SquareSum[0] / num_points);
121		2	*mean = Sum[0] / num_points;
122			}
123			#endif /* LV_HAVE_GENERIC */
124
125		88	static inline float update_square_sum_1_val(const float SquareSum,
126			const float Sum,
127			const uint32_t len,
128			const float val)
129			{
130			// Updates a sum of squares calculated over len values with the value val
131		88	float n = (float)len;
132		88	float n_plus_one = n + 1.f;
133		176	return SquareSum +
134		88	1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
135			}
136
137		88	static inline float add_square_sums(const float SquareSum0,
138			const float Sum0,
139			const float SquareSum1,
140			const float Sum1,
141			const uint32_t len)
142			{
143			// Add two sums of squares calculated over the same number of values, len
144		88	float n = (float)len;
145		88	return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
146			}
147
148		8	static inline void accrue_result(float* PartialSquareSums,
149			float* PartialSums,
150			const uint32_t NumberOfPartitions,
151			const uint32_t PartitionLen)
152			{
153			// Add all partial sums and square sums into the first element of the arrays
154		8	uint32_t accumulators = NumberOfPartitions;
155		8	uint32_t stages = 0;
156		8	uint32_t offset = 1;
157		8	uint32_t partition_len = PartitionLen;
158
159	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 8 times.	36	while (accumulators >>= 1) {
160		28	stages++;
161			} // Integer log2
162		8	accumulators = NumberOfPartitions;
163
164	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 8 times.	36	for (uint32_t s = 0; s < stages; s++) {
165		28	accumulators /= 2;
166		28	uint32_t idx = 0;
167	2/2 ✓ Branch 0 taken 88 times. ✓ Branch 1 taken 28 times.	116	for (uint32_t a = 0; a < accumulators; a++) {
168		176	PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
169		88	PartialSums[idx],
170		88	PartialSquareSums[idx + offset],
171		88	PartialSums[idx + offset],
172			partition_len);
173		88	PartialSums[idx] += PartialSums[idx + offset];
174		88	idx += 2 * offset;
175			}
176		28	offset *= 2;
177		28	partition_len *= 2;
178			}
179		8	}
180
181			#ifdef LV_HAVE_NEON
182			#include <arm_neon.h>
183			#include <volk/volk_neon_intrinsics.h>
184
185			static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
186			float* mean,
187			const float* inputBuffer,
188			unsigned int num_points)
189			{
190			if (num_points < 8) {
191			volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
192			return;
193			}
194
195			const float* in_ptr = inputBuffer;
196
197			__VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
198			__VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
199
200			const uint32_t eigth_points = num_points / 8;
201
202			float32x4_t Sum0, Sum1;
203
204			Sum0 = vld1q_f32((const float32_t*)in_ptr);
205			in_ptr += 4;
206			__VOLK_PREFETCH(in_ptr + 4);
207
208			Sum1 = vld1q_f32((const float32_t*)in_ptr);
209			in_ptr += 4;
210			__VOLK_PREFETCH(in_ptr + 4);
211
212			float32x4_t SquareSum0 = { 0.f };
213			float32x4_t SquareSum1 = { 0.f };
214
215			float32x4_t Values0, Values1;
216			float32x4_t Aux0, Aux1;
217			float32x4_t Reciprocal;
218
219			for (uint32_t number = 1; number < eigth_points; number++) {
220			Values0 = vld1q_f32(in_ptr);
221			in_ptr += 4;
222			__VOLK_PREFETCH(in_ptr + 4);
223
224			Values1 = vld1q_f32(in_ptr);
225			in_ptr += 4;
226			__VOLK_PREFETCH(in_ptr + 4);
227
228			float n = (float)number;
229			float n_plus_one = n + 1.f;
230			Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
231
232			Sum0 = vaddq_f32(Sum0, Values0);
233			Aux0 = vdupq_n_f32(n_plus_one);
234			SquareSum0 =
235			_neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
236
237			Sum1 = vaddq_f32(Sum1, Values1);
238			Aux1 = vdupq_n_f32(n_plus_one);
239			SquareSum1 =
240			_neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
241			}
242
243			vst1q_f32(&SumLocal[0], Sum0);
244			vst1q_f32(&SumLocal[4], Sum1);
245			vst1q_f32(&SquareSumLocal[0], SquareSum0);
246			vst1q_f32(&SquareSumLocal[4], SquareSum1);
247
248			accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
249
250			uint32_t points_done = eigth_points * 8;
251
252			for (; points_done < num_points; points_done++) {
253			float val = (*in_ptr++);
254			SumLocal[0] += val;
255			SquareSumLocal[0] =
256			update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
257			}
258
259			*stddev = sqrtf(SquareSumLocal[0] / num_points);
260			*mean = SumLocal[0] / num_points;
261			}
262			#endif /* LV_HAVE_NEON */
263
264			#ifdef LV_HAVE_SSE
265			#include <volk/volk_sse_intrinsics.h>
266			#include <xmmintrin.h>
267
268		2	static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
269			float* mean,
270			const float* inputBuffer,
271			unsigned int num_points)
272			{
273	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	if (num_points < 8) {
274		✗	volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
275		✗	return;
276			}
277
278		2	const float* in_ptr = inputBuffer;
279
280		2	__VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
281		2	__VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
282
283
284		2	const uint32_t eigth_points = num_points / 8;
285
286		2	__m128 Sum0 = _mm_loadu_ps(in_ptr);
287		2	in_ptr += 4;
288		2	__m128 Sum1 = _mm_loadu_ps(in_ptr);
289		2	in_ptr += 4;
290		2	__m128 SquareSum0 = _mm_setzero_ps();
291		2	__m128 SquareSum1 = _mm_setzero_ps();
292			__m128 Values0, Values1;
293			__m128 Aux0, Aux1;
294			__m128 Reciprocal;
295
296	2/2 ✓ Branch 0 taken 32764 times. ✓ Branch 1 taken 2 times.	32766	for (uint32_t number = 1; number < eigth_points; number++) {
297		32764	Values0 = _mm_loadu_ps(in_ptr);
298		32764	in_ptr += 4;
299		32764	__VOLK_PREFETCH(in_ptr + 4);
300
301		32764	Values1 = _mm_loadu_ps(in_ptr);
302		32764	in_ptr += 4;
303		32764	__VOLK_PREFETCH(in_ptr + 4);
304
305		32764	float n = (float)number;
306		32764	float n_plus_one = n + 1.f;
307		65528	Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
308
309		32764	Sum0 = _mm_add_ps(Sum0, Values0);
310		32764	Aux0 = _mm_set_ps1(n_plus_one);
311			SquareSum0 =
312		32764	_mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
313
314		32764	Sum1 = _mm_add_ps(Sum1, Values1);
315		32764	Aux1 = _mm_set_ps1(n_plus_one);
316			SquareSum1 =
317		32764	_mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
318			}
319
320			_mm_store_ps(&SumLocal[0], Sum0);
321			_mm_store_ps(&SumLocal[4], Sum1);
322			_mm_store_ps(&SquareSumLocal[0], SquareSum0);
323			_mm_store_ps(&SquareSumLocal[4], SquareSum1);
324
325		2	accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
326
327		2	uint32_t points_done = eigth_points * 8;
328
329	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; points_done < num_points; points_done++) {
330		14	float val = (*in_ptr++);
331		14	SumLocal[0] += val;
332		14	SquareSumLocal[0] =
333		14	update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
334			}
335
336		2	*stddev = sqrtf(SquareSumLocal[0] / num_points);
337		2	*mean = SumLocal[0] / num_points;
338			}
339			#endif /* LV_HAVE_SSE */
340
341			#ifdef LV_HAVE_AVX
342			#include <immintrin.h>
343			#include <volk/volk_avx_intrinsics.h>
344
345		2	static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
346			float* mean,
347			const float* inputBuffer,
348			unsigned int num_points)
349			{
350	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	if (num_points < 16) {
351		✗	volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
352		✗	return;
353			}
354
355		2	const float* in_ptr = inputBuffer;
356
357		2	__VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
358		2	__VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
359
360		2	const unsigned int sixteenth_points = num_points / 16;
361
362		2	__m256 Sum0 = _mm256_loadu_ps(in_ptr);
363		2	in_ptr += 8;
364		2	__m256 Sum1 = _mm256_loadu_ps(in_ptr);
365		2	in_ptr += 8;
366
367		2	__m256 SquareSum0 = _mm256_setzero_ps();
368		2	__m256 SquareSum1 = _mm256_setzero_ps();
369			__m256 Values0, Values1;
370			__m256 Aux0, Aux1;
371			__m256 Reciprocal;
372
373	2/2 ✓ Branch 0 taken 16380 times. ✓ Branch 1 taken 2 times.	16382	for (uint32_t number = 1; number < sixteenth_points; number++) {
374		16380	Values0 = _mm256_loadu_ps(in_ptr);
375		16380	in_ptr += 8;
376		16380	__VOLK_PREFETCH(in_ptr + 8);
377
378		16380	Values1 = _mm256_loadu_ps(in_ptr);
379		16380	in_ptr += 8;
380		16380	__VOLK_PREFETCH(in_ptr + 8);
381
382		16380	float n = (float)number;
383		16380	float n_plus_one = n + 1.f;
384
385		32760	Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
386
387		16380	Sum0 = _mm256_add_ps(Sum0, Values0);
388		16380	Aux0 = _mm256_set1_ps(n_plus_one);
389			SquareSum0 =
390		16380	_mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
391
392		16380	Sum1 = _mm256_add_ps(Sum1, Values1);
393		16380	Aux1 = _mm256_set1_ps(n_plus_one);
394			SquareSum1 =
395		16380	_mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
396			}
397
398			_mm256_store_ps(&SumLocal[0], Sum0);
399			_mm256_store_ps(&SumLocal[8], Sum1);
400			_mm256_store_ps(&SquareSumLocal[0], SquareSum0);
401			_mm256_store_ps(&SquareSumLocal[8], SquareSum1);
402
403		2	accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
404
405		2	uint32_t points_done = sixteenth_points * 16;
406
407	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; points_done < num_points; points_done++) {
408		30	float val = (*in_ptr++);
409		30	SumLocal[0] += val;
410		30	SquareSumLocal[0] =
411		30	update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
412			}
413
414		2	*stddev = sqrtf(SquareSumLocal[0] / num_points);
415		2	*mean = SumLocal[0] / num_points;
416			}
417			#endif /* LV_HAVE_AVX */
418
419			#ifdef LV_HAVE_SSE
420			#include <xmmintrin.h>
421
422		2	static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
423			float* mean,
424			const float* inputBuffer,
425			unsigned int num_points)
426			{
427	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	if (num_points < 8) {
428		✗	volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
429		✗	return;
430			}
431
432		2	const float* in_ptr = inputBuffer;
433
434		2	__VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
435		2	__VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
436
437
438		2	const uint32_t eigth_points = num_points / 8;
439
440		2	__m128 Sum0 = _mm_load_ps(in_ptr);
441		2	in_ptr += 4;
442		2	__m128 Sum1 = _mm_load_ps(in_ptr);
443		2	in_ptr += 4;
444		2	__m128 SquareSum0 = _mm_setzero_ps();
445		2	__m128 SquareSum1 = _mm_setzero_ps();
446			__m128 Values0, Values1;
447			__m128 Aux0, Aux1;
448			__m128 Reciprocal;
449
450	2/2 ✓ Branch 0 taken 32764 times. ✓ Branch 1 taken 2 times.	32766	for (uint32_t number = 1; number < eigth_points; number++) {
451		32764	Values0 = _mm_load_ps(in_ptr);
452		32764	in_ptr += 4;
453		32764	__VOLK_PREFETCH(in_ptr + 4);
454
455		32764	Values1 = _mm_load_ps(in_ptr);
456		32764	in_ptr += 4;
457		32764	__VOLK_PREFETCH(in_ptr + 4);
458
459		32764	float n = (float)number;
460		32764	float n_plus_one = n + 1.f;
461		65528	Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
462
463		32764	Sum0 = _mm_add_ps(Sum0, Values0);
464		32764	Aux0 = _mm_set_ps1(n_plus_one);
465			SquareSum0 =
466		32764	_mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
467
468		32764	Sum1 = _mm_add_ps(Sum1, Values1);
469		32764	Aux1 = _mm_set_ps1(n_plus_one);
470			SquareSum1 =
471		32764	_mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
472			}
473
474			_mm_store_ps(&SumLocal[0], Sum0);
475			_mm_store_ps(&SumLocal[4], Sum1);
476			_mm_store_ps(&SquareSumLocal[0], SquareSum0);
477			_mm_store_ps(&SquareSumLocal[4], SquareSum1);
478
479		2	accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
480
481		2	uint32_t points_done = eigth_points * 8;
482
483	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; points_done < num_points; points_done++) {
484		14	float val = (*in_ptr++);
485		14	SumLocal[0] += val;
486		14	SquareSumLocal[0] =
487		14	update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
488			}
489
490		2	*stddev = sqrtf(SquareSumLocal[0] / num_points);
491		2	*mean = SumLocal[0] / num_points;
492			}
493			#endif /* LV_HAVE_SSE */
494
495			#ifdef LV_HAVE_AVX
496			#include <immintrin.h>
497
498		2	static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
499			float* mean,
500			const float* inputBuffer,
501			unsigned int num_points)
502			{
503	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	if (num_points < 16) {
504		✗	volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
505		✗	return;
506			}
507
508		2	const float* in_ptr = inputBuffer;
509
510		2	__VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
511		2	__VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
512
513		2	const unsigned int sixteenth_points = num_points / 16;
514
515		2	__m256 Sum0 = _mm256_load_ps(in_ptr);
516		2	in_ptr += 8;
517		2	__m256 Sum1 = _mm256_load_ps(in_ptr);
518		2	in_ptr += 8;
519
520		2	__m256 SquareSum0 = _mm256_setzero_ps();
521		2	__m256 SquareSum1 = _mm256_setzero_ps();
522			__m256 Values0, Values1;
523			__m256 Aux0, Aux1;
524			__m256 Reciprocal;
525
526	2/2 ✓ Branch 0 taken 16380 times. ✓ Branch 1 taken 2 times.	16382	for (uint32_t number = 1; number < sixteenth_points; number++) {
527		16380	Values0 = _mm256_load_ps(in_ptr);
528		16380	in_ptr += 8;
529		16380	__VOLK_PREFETCH(in_ptr + 8);
530
531		16380	Values1 = _mm256_load_ps(in_ptr);
532		16380	in_ptr += 8;
533		16380	__VOLK_PREFETCH(in_ptr + 8);
534
535		16380	float n = (float)number;
536		16380	float n_plus_one = n + 1.f;
537
538		32760	Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
539
540		16380	Sum0 = _mm256_add_ps(Sum0, Values0);
541		16380	Aux0 = _mm256_set1_ps(n_plus_one);
542			SquareSum0 =
543		16380	_mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
544
545		16380	Sum1 = _mm256_add_ps(Sum1, Values1);
546		16380	Aux1 = _mm256_set1_ps(n_plus_one);
547			SquareSum1 =
548		16380	_mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
549			}
550
551			_mm256_store_ps(&SumLocal[0], Sum0);
552			_mm256_store_ps(&SumLocal[8], Sum1);
553			_mm256_store_ps(&SquareSumLocal[0], SquareSum0);
554			_mm256_store_ps(&SquareSumLocal[8], SquareSum1);
555
556		2	accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
557
558		2	uint32_t points_done = sixteenth_points * 16;
559
560	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; points_done < num_points; points_done++) {
561		30	float val = (*in_ptr++);
562		30	SumLocal[0] += val;
563		30	SquareSumLocal[0] =
564		30	update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
565			}
566
567		2	*stddev = sqrtf(SquareSumLocal[0] / num_points);
568		2	*mean = SumLocal[0] / num_points;
569			}
570			#endif /* LV_HAVE_AVX */
571
572			#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
573