GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_acos_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	253	253	100.0%
Functions:	7	7	100.0%
Branches:	50	50	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_acos_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes arccosine of the input vector and stores results in the output vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li bVector: The vector where results will be stored.
    
       *
    
       * \b Example
    
       * Calculate common angles around the top half of the unit circle.
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   in[0] = 1;
    
       *   in[1] = std::sqrt(3.f)/2.f;
    
       *   in[2] = std::sqrt(2.f)/2.f;
    
       *   in[3] = 0.5;
    
       *   in[4] = in[5] = 0;
    
       *   for(unsigned int ii = 6; ii < N; ++ii){
    
       *       in[ii] = - in[N-ii-1];
    
       *   }
    
       *
    
       *   volk_32f_acos_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      /* This is the number of terms of Taylor series to evaluate, increase this for more
    
       * accuracy*/
    
      #define ACOS_TERMS 2
    
      #ifndef INCLUDED_volk_32f_acos_32f_a_H
    
      #define INCLUDED_volk_32f_acos_32f_a_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm256_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              d = aVal;
    
      131064
              aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))),
    
                                   aVal);
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++)
    
      196596
                  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      196596
                  y = _mm256_fmadd_ps(
    
      65532
                      y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      65532
              y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
    
      32766
              arccosine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_sub_ps(
    
                  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
    
              _mm256_store_ps(bPtr, arccosine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = acos(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm256_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              d = aVal;
    
      131064
              aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))),
    
                                   aVal);
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++)
    
      262128
                  x = _mm256_add_ps(x,
    
                                    _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      262128
                  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
    
      65532
                                    _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      98298
              y = _mm256_add_ps(
    
                  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
    
      32766
              arccosine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_sub_ps(
    
                  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
    
              _mm256_store_ps(bPtr, arccosine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = acos(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int quarterPoints = num_points / 4;
    
          int i, j;
    
          __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m128 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm_setzero_ps();
    
      2
          fones = _mm_set1_ps(1.0);
    
      2
          ftwos = _mm_set1_ps(2.0);
    
      2
          ffours = _mm_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              d = aVal;
    
      262136
              aVal = _mm_div_ps(
    
                  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
    
                  aVal);
    
      65534
              z = aVal;
    
      65534
              condition = _mm_cmplt_ps(z, fzeroes);
    
      196602
              z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(z, fones);
    
      196602
              x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (i = 0; i < 2; i++)
    
      524272
                  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
    
      65534
              x = _mm_div_ps(fones, x);
    
      65534
              y = fzeroes;
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      524272
                  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
    
      131068
                                 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      131068
              y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
    
      65534
              condition = _mm_cmpgt_ps(z, fones);
    
      196602
              y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
    
      65534
              arccosine = y;
    
      65534
              condition = _mm_cmplt_ps(aVal, fzeroes);
    
              arccosine =
    
      196602
                  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(d, fzeroes);
    
      131068
              arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
    
              _mm_store_ps(bPtr, arccosine);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = acosf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #endif /* INCLUDED_volk_32f_acos_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_acos_32f_u_H
    
      #define INCLUDED_volk_32f_acos_32f_u_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm256_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              d = aVal;
    
      131064
              aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))),
    
                                   aVal);
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++)
    
      196596
                  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      196596
                  y = _mm256_fmadd_ps(
    
      65532
                      y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      65532
              y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
    
      32766
              arccosine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_sub_ps(
    
                  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
    
              _mm256_storeu_ps(bPtr, arccosine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = acos(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm256_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              d = aVal;
    
      131064
              aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))),
    
                                   aVal);
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++)
    
      262128
                  x = _mm256_add_ps(x,
    
                                    _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      262128
                  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
    
      65532
                                    _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      98298
              y = _mm256_add_ps(
    
                  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
    
      32766
              arccosine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_sub_ps(
    
                  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
    
      65532
              arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
    
              _mm256_storeu_ps(bPtr, arccosine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = acos(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for unaligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int quarterPoints = num_points / 4;
    
          int i, j;
    
          __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    
          __m128 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pi = _mm_set1_ps(3.14159265358979323846);
    
      2
          pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm_setzero_ps();
    
      2
          fones = _mm_set1_ps(1.0);
    
      2
          ftwos = _mm_set1_ps(2.0);
    
      2
          ffours = _mm_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      65534
              d = aVal;
    
      262136
              aVal = _mm_div_ps(
    
                  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
    
                  aVal);
    
      65534
              z = aVal;
    
      65534
              condition = _mm_cmplt_ps(z, fzeroes);
    
      196602
              z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(z, fones);
    
      196602
              x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (i = 0; i < 2; i++)
    
      524272
                  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
    
      65534
              x = _mm_div_ps(fones, x);
    
      65534
              y = fzeroes;
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (j = ACOS_TERMS - 1; j >= 0; j--)
    
      524272
                  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
    
      131068
                                 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
    
      131068
              y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
    
      65534
              condition = _mm_cmpgt_ps(z, fones);
    
      196602
              y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
    
      65534
              arccosine = y;
    
      65534
              condition = _mm_cmplt_ps(aVal, fzeroes);
    
              arccosine =
    
      196602
                  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(d, fzeroes);
    
      131068
              arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
    
              _mm_storeu_ps(bPtr, arccosine);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = acosf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for unaligned */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *bPtr++ = acosf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_acos_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_acos_32f
12			*
13			* \b Overview
14			*
15			* Computes arccosine of the input vector and stores results in the output vector.
16			*
17			* <b>Dispatcher Prototype</b>
18			* \code
19			* void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
20			* \endcode
21			*
22			* \b Inputs
23			* \li aVector: The input vector of floats.
24			* \li num_points: The number of data points.
25			*
26			* \b Outputs
27			* \li bVector: The vector where results will be stored.
28			*
29			* \b Example
30			* Calculate common angles around the top half of the unit circle.
31			* \code
32			* int N = 10;
33			* unsigned int alignment = volk_get_alignment();
34			* float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36			*
37			* in[0] = 1;
38			* in[1] = std::sqrt(3.f)/2.f;
39			* in[2] = std::sqrt(2.f)/2.f;
40			* in[3] = 0.5;
41			* in[4] = in[5] = 0;
42			* for(unsigned int ii = 6; ii < N; ++ii){
43			* in[ii] = - in[N-ii-1];
44			* }
45			*
46			* volk_32f_acos_32f(out, in, N);
47			*
48			* for(unsigned int ii = 0; ii < N; ++ii){
49			* printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
50			* }
51			*
52			* volk_free(in);
53			* volk_free(out);
54			* \endcode
55			*/
56
57			#include <inttypes.h>
58			#include <math.h>
59			#include <stdio.h>
60
61			/* This is the number of terms of Taylor series to evaluate, increase this for more
62			* accuracy*/
63			#define ACOS_TERMS 2
64
65			#ifndef INCLUDED_volk_32f_acos_32f_a_H
66			#define INCLUDED_volk_32f_acos_32f_a_H
67
68			#if LV_HAVE_AVX2 && LV_HAVE_FMA
69			#include <immintrin.h>
70
71		2	static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72			const float* aVector,
73			unsigned int num_points)
74			{
75		2	float* bPtr = bVector;
76		2	const float* aPtr = aVector;
77
78		2	unsigned int number = 0;
79		2	unsigned int eighthPoints = num_points / 8;
80			int i, j;
81
82			__m256 aVal, d, pi, pio2, x, y, z, arccosine;
83			__m256 fzeroes, fones, ftwos, ffours, condition;
84
85		2	pi = _mm256_set1_ps(3.14159265358979323846);
86		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87		2	fzeroes = _mm256_setzero_ps();
88		2	fones = _mm256_set1_ps(1.0);
89		2	ftwos = _mm256_set1_ps(2.0);
90		2	ffours = _mm256_set1_ps(4.0);
91
92	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
93		32766	aVal = _mm256_load_ps(aPtr);
94		32766	d = aVal;
95		131064	aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96			_mm256_sub_ps(fones, aVal))),
97			aVal);
98		32766	z = aVal;
99		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102		98298	x = _mm256_add_ps(
103			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104
105	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++)
106		196596	x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107		32766	x = _mm256_div_ps(fones, x);
108		32766	y = fzeroes;
109	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ACOS_TERMS - 1; j >= 0; j--)
110		196596	y = _mm256_fmadd_ps(
111		65532	y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
112
113		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
115
116		65532	y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
117		32766	arccosine = y;
118		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119		65532	arccosine = _mm256_sub_ps(
120			arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
121		32766	condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122		65532	arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
123
124			_mm256_store_ps(bPtr, arccosine);
125		32766	aPtr += 8;
126		32766	bPtr += 8;
127			}
128
129		2	number = eighthPoints * 8;
130	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
131		14	*bPtr++ = acos(*aPtr++);
132			}
133		2	}
134
135			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
136
137
138			#ifdef LV_HAVE_AVX
139			#include <immintrin.h>
140
141			static inline void
142		2	volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
143			{
144		2	float* bPtr = bVector;
145		2	const float* aPtr = aVector;
146
147		2	unsigned int number = 0;
148		2	unsigned int eighthPoints = num_points / 8;
149			int i, j;
150
151			__m256 aVal, d, pi, pio2, x, y, z, arccosine;
152			__m256 fzeroes, fones, ftwos, ffours, condition;
153
154		2	pi = _mm256_set1_ps(3.14159265358979323846);
155		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156		2	fzeroes = _mm256_setzero_ps();
157		2	fones = _mm256_set1_ps(1.0);
158		2	ftwos = _mm256_set1_ps(2.0);
159		2	ffours = _mm256_set1_ps(4.0);
160
161	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
162		32766	aVal = _mm256_load_ps(aPtr);
163		32766	d = aVal;
164		131064	aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165			_mm256_sub_ps(fones, aVal))),
166			aVal);
167		32766	z = aVal;
168		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
170		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
171		98298	x = _mm256_add_ps(
172			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
173
174	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++)
175		262128	x = _mm256_add_ps(x,
176			_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
177		32766	x = _mm256_div_ps(fones, x);
178		32766	y = fzeroes;
179	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ACOS_TERMS - 1; j >= 0; j--)
180		262128	y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181		65532	_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
182
183		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
184		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
185
186		98298	y = _mm256_add_ps(
187			y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
188		32766	arccosine = y;
189		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190		65532	arccosine = _mm256_sub_ps(
191			arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
192		32766	condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193		65532	arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
194
195			_mm256_store_ps(bPtr, arccosine);
196		32766	aPtr += 8;
197		32766	bPtr += 8;
198			}
199
200		2	number = eighthPoints * 8;
201	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
202		14	bPtr++ = acos(aPtr++);
203			}
204		2	}
205
206			#endif /* LV_HAVE_AVX for aligned */
207
208			#ifdef LV_HAVE_SSE4_1
209			#include <smmintrin.h>
210
211			static inline void
212		2	volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
213			{
214		2	float* bPtr = bVector;
215		2	const float* aPtr = aVector;
216
217		2	unsigned int number = 0;
218		2	unsigned int quarterPoints = num_points / 4;
219			int i, j;
220
221			__m128 aVal, d, pi, pio2, x, y, z, arccosine;
222			__m128 fzeroes, fones, ftwos, ffours, condition;
223
224		2	pi = _mm_set1_ps(3.14159265358979323846);
225		2	pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
226		2	fzeroes = _mm_setzero_ps();
227		2	fones = _mm_set1_ps(1.0);
228		2	ftwos = _mm_set1_ps(2.0);
229		2	ffours = _mm_set1_ps(4.0);
230
231	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
232		65534	aVal = _mm_load_ps(aPtr);
233		65534	d = aVal;
234		262136	aVal = _mm_div_ps(
235			_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
236			aVal);
237		65534	z = aVal;
238		65534	condition = _mm_cmplt_ps(z, fzeroes);
239		196602	z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
240		65534	condition = _mm_cmplt_ps(z, fones);
241		196602	x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
242
243	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (i = 0; i < 2; i++)
244		524272	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245		65534	x = _mm_div_ps(fones, x);
246		65534	y = fzeroes;
247	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (j = ACOS_TERMS - 1; j >= 0; j--)
248		524272	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
249		131068	_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
250
251		131068	y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
252		65534	condition = _mm_cmpgt_ps(z, fones);
253
254		196602	y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255		65534	arccosine = y;
256		65534	condition = _mm_cmplt_ps(aVal, fzeroes);
257			arccosine =
258		196602	_mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
259		65534	condition = _mm_cmplt_ps(d, fzeroes);
260		131068	arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
261
262			_mm_store_ps(bPtr, arccosine);
263		65534	aPtr += 4;
264		65534	bPtr += 4;
265			}
266
267		2	number = quarterPoints * 4;
268	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
269		6	bPtr++ = acosf(aPtr++);
270			}
271		2	}
272
273			#endif /* LV_HAVE_SSE4_1 for aligned */
274
275			#endif /* INCLUDED_volk_32f_acos_32f_a_H */
276
277
278			#ifndef INCLUDED_volk_32f_acos_32f_u_H
279			#define INCLUDED_volk_32f_acos_32f_u_H
280
281			#if LV_HAVE_AVX2 && LV_HAVE_FMA
282			#include <immintrin.h>
283
284		2	static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
285			const float* aVector,
286			unsigned int num_points)
287			{
288		2	float* bPtr = bVector;
289		2	const float* aPtr = aVector;
290
291		2	unsigned int number = 0;
292		2	unsigned int eighthPoints = num_points / 8;
293			int i, j;
294
295			__m256 aVal, d, pi, pio2, x, y, z, arccosine;
296			__m256 fzeroes, fones, ftwos, ffours, condition;
297
298		2	pi = _mm256_set1_ps(3.14159265358979323846);
299		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300		2	fzeroes = _mm256_setzero_ps();
301		2	fones = _mm256_set1_ps(1.0);
302		2	ftwos = _mm256_set1_ps(2.0);
303		2	ffours = _mm256_set1_ps(4.0);
304
305	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
306		32766	aVal = _mm256_loadu_ps(aPtr);
307		32766	d = aVal;
308		131064	aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309			_mm256_sub_ps(fones, aVal))),
310			aVal);
311		32766	z = aVal;
312		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
314		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
315		98298	x = _mm256_add_ps(
316			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
317
318	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++)
319		196596	x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
320		32766	x = _mm256_div_ps(fones, x);
321		32766	y = fzeroes;
322	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ACOS_TERMS - 1; j >= 0; j--)
323		196596	y = _mm256_fmadd_ps(
324		65532	y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
325
326		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
327		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
328
329		65532	y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
330		32766	arccosine = y;
331		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332		65532	arccosine = _mm256_sub_ps(
333			arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
334		32766	condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335		65532	arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
336
337			_mm256_storeu_ps(bPtr, arccosine);
338		32766	aPtr += 8;
339		32766	bPtr += 8;
340			}
341
342		2	number = eighthPoints * 8;
343	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
344		14	bPtr++ = acos(aPtr++);
345			}
346		2	}
347
348			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
349
350
351			#ifdef LV_HAVE_AVX
352			#include <immintrin.h>
353
354			static inline void
355		2	volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
356			{
357		2	float* bPtr = bVector;
358		2	const float* aPtr = aVector;
359
360		2	unsigned int number = 0;
361		2	unsigned int eighthPoints = num_points / 8;
362			int i, j;
363
364			__m256 aVal, d, pi, pio2, x, y, z, arccosine;
365			__m256 fzeroes, fones, ftwos, ffours, condition;
366
367		2	pi = _mm256_set1_ps(3.14159265358979323846);
368		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369		2	fzeroes = _mm256_setzero_ps();
370		2	fones = _mm256_set1_ps(1.0);
371		2	ftwos = _mm256_set1_ps(2.0);
372		2	ffours = _mm256_set1_ps(4.0);
373
374	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
375		32766	aVal = _mm256_loadu_ps(aPtr);
376		32766	d = aVal;
377		131064	aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378			_mm256_sub_ps(fones, aVal))),
379			aVal);
380		32766	z = aVal;
381		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
383		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
384		98298	x = _mm256_add_ps(
385			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
386
387	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++)
388		262128	x = _mm256_add_ps(x,
389			_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
390		32766	x = _mm256_div_ps(fones, x);
391		32766	y = fzeroes;
392	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ACOS_TERMS - 1; j >= 0; j--)
393		262128	y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394		65532	_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
395
396		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
397		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
398
399		98298	y = _mm256_add_ps(
400			y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
401		32766	arccosine = y;
402		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403		65532	arccosine = _mm256_sub_ps(
404			arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
405		32766	condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406		65532	arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
407
408			_mm256_storeu_ps(bPtr, arccosine);
409		32766	aPtr += 8;
410		32766	bPtr += 8;
411			}
412
413		2	number = eighthPoints * 8;
414	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
415		14	bPtr++ = acos(aPtr++);
416			}
417		2	}
418
419			#endif /* LV_HAVE_AVX for unaligned */
420
421			#ifdef LV_HAVE_SSE4_1
422			#include <smmintrin.h>
423
424			static inline void
425		2	volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
426			{
427		2	float* bPtr = bVector;
428		2	const float* aPtr = aVector;
429
430		2	unsigned int number = 0;
431		2	unsigned int quarterPoints = num_points / 4;
432			int i, j;
433
434			__m128 aVal, d, pi, pio2, x, y, z, arccosine;
435			__m128 fzeroes, fones, ftwos, ffours, condition;
436
437		2	pi = _mm_set1_ps(3.14159265358979323846);
438		2	pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
439		2	fzeroes = _mm_setzero_ps();
440		2	fones = _mm_set1_ps(1.0);
441		2	ftwos = _mm_set1_ps(2.0);
442		2	ffours = _mm_set1_ps(4.0);
443
444	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
445		65534	aVal = _mm_loadu_ps(aPtr);
446		65534	d = aVal;
447		262136	aVal = _mm_div_ps(
448			_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
449			aVal);
450		65534	z = aVal;
451		65534	condition = _mm_cmplt_ps(z, fzeroes);
452		196602	z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453		65534	condition = _mm_cmplt_ps(z, fones);
454		196602	x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455
456	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (i = 0; i < 2; i++)
457		524272	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458		65534	x = _mm_div_ps(fones, x);
459		65534	y = fzeroes;
460
461	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (j = ACOS_TERMS - 1; j >= 0; j--)
462		524272	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463		131068	_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464
465		131068	y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
466		65534	condition = _mm_cmpgt_ps(z, fones);
467
468		196602	y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
469		65534	arccosine = y;
470		65534	condition = _mm_cmplt_ps(aVal, fzeroes);
471			arccosine =
472		196602	_mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
473		65534	condition = _mm_cmplt_ps(d, fzeroes);
474		131068	arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
475
476			_mm_storeu_ps(bPtr, arccosine);
477		65534	aPtr += 4;
478		65534	bPtr += 4;
479			}
480
481		2	number = quarterPoints * 4;
482	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
483		6	bPtr++ = acosf(aPtr++);
484			}
485		2	}
486
487			#endif /* LV_HAVE_SSE4_1 for unaligned */
488
489			#ifdef LV_HAVE_GENERIC
490
491			static inline void
492		2	volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
493			{
494		2	float* bPtr = bVector;
495		2	const float* aPtr = aVector;
496		2	unsigned int number = 0;
497
498	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
499		262142	bPtr++ = acosf(aPtr++);
500			}
501		2	}
502			#endif /* LV_HAVE_GENERIC */
503
504			#endif /* INCLUDED_volk_32f_acos_32f_u_H */
505