GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_asin_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	229	229	100.0%
Functions:	7	7	100.0%
Branches:	50	50	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_asin_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes arcsine of input vector and stores results in output vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_asin_32f(float* bVector, const float* aVector, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li bVector: The vector where results will be stored.
    
       *
    
       * \b Example
    
       * \code
    
       * Calculate common angles around the top half of the unit circle.
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   in[0] = 0;
    
       *   in[1] = 0.5;
    
       *   in[2] = std::sqrt(2.f)/2.f;
    
       *   in[3] = std::sqrt(3.f)/2.f;
    
       *   in[4] = in[5] = 1;
    
       *   for(unsigned int ii = 6; ii < N; ++ii){
    
       *       in[ii] = - in[N-ii-1];
    
       *   }
    
       *
    
       *   volk_32f_asin_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("asin(%1.3f) = %1.3f\n", in[ii], out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      /* This is the number of terms of Taylor series to evaluate, increase this for more
    
       * accuracy*/
    
      #define ASIN_TERMS 2
    
      #ifndef INCLUDED_volk_32f_asin_32f_a_H
    
      #define INCLUDED_volk_32f_asin_32f_a_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, pio2, x, y, z, arcsine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      131064
              aVal = _mm256_div_ps(aVal,
    
                                   _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))));
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++) {
    
      196596
                  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
    
              }
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      196596
                  y = _mm256_fmadd_ps(
    
      65532
                      y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      65532
              y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
    
      32766
              arcsine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      98298
              arcsine = _mm256_sub_ps(arcsine,
    
                                      _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
    
              _mm256_store_ps(bPtr, arcsine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = asin(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, pio2, x, y, z, arcsine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      131064
              aVal = _mm256_div_ps(aVal,
    
                                   _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))));
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++) {
    
      262128
                  x = _mm256_add_ps(x,
    
                                    _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
    
              }
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      262128
                  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
    
      65532
                                    _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      98298
              y = _mm256_add_ps(
    
                  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
    
      32766
              arcsine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      98298
              arcsine = _mm256_sub_ps(arcsine,
    
                                      _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
    
              _mm256_store_ps(bPtr, arcsine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = asin(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int quarterPoints = num_points / 4;
    
          int i, j;
    
          __m128 aVal, pio2, x, y, z, arcsine;
    
          __m128 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm_setzero_ps();
    
      2
          fones = _mm_set1_ps(1.0);
    
      2
          ftwos = _mm_set1_ps(2.0);
    
      2
          ffours = _mm_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      262136
              aVal = _mm_div_ps(
    
                  aVal,
    
                  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
    
      65534
              z = aVal;
    
      65534
              condition = _mm_cmplt_ps(z, fzeroes);
    
      196602
              z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(z, fones);
    
      196602
              x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (i = 0; i < 2; i++) {
    
      524272
                  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
    
              }
    
      65534
              x = _mm_div_ps(fones, x);
    
      65534
              y = fzeroes;
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      524272
                  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
    
      131068
                                 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      131068
              y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
    
      65534
              condition = _mm_cmpgt_ps(z, fones);
    
      196602
              y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
    
      65534
              arcsine = y;
    
      65534
              condition = _mm_cmplt_ps(aVal, fzeroes);
    
      196602
              arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
    
              _mm_store_ps(bPtr, arcsine);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = asinf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #endif /* INCLUDED_volk_32f_asin_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_asin_32f_u_H
    
      #define INCLUDED_volk_32f_asin_32f_u_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, pio2, x, y, z, arcsine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      131064
              aVal = _mm256_div_ps(aVal,
    
                                   _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))));
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++) {
    
      196596
                  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
    
              }
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      196596
                  y = _mm256_fmadd_ps(
    
      65532
                      y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      65532
              y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
    
      32766
              arcsine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      98298
              arcsine = _mm256_sub_ps(arcsine,
    
                                      _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
    
              _mm256_storeu_ps(bPtr, arcsine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = asin(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
          int i, j;
    
          __m256 aVal, pio2, x, y, z, arcsine;
    
          __m256 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm256_setzero_ps();
    
      2
          fones = _mm256_set1_ps(1.0);
    
      2
          ftwos = _mm256_set1_ps(2.0);
    
      2
          ffours = _mm256_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      131064
              aVal = _mm256_div_ps(aVal,
    
                                   _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
    
                                                                _mm256_sub_ps(fones, aVal))));
    
      32766
              z = aVal;
    
      32766
              condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
    
      65532
              z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
    
      98298
              x = _mm256_add_ps(
    
                  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (i = 0; i < 2; i++) {
    
      262128
                  x = _mm256_add_ps(x,
    
                                    _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
    
              }
    
      32766
              x = _mm256_div_ps(fones, x);
    
      32766
              y = fzeroes;
    
        2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.

      98298
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      262128
                  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
    
      65532
                                    _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      32766
              y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
    
      32766
              condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
    
      98298
              y = _mm256_add_ps(
    
                  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
    
      32766
              arcsine = y;
    
      32766
              condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    
      98298
              arcsine = _mm256_sub_ps(arcsine,
    
                                      _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
    
              _mm256_storeu_ps(bPtr, arcsine);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = asin(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for unaligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int quarterPoints = num_points / 4;
    
          int i, j;
    
          __m128 aVal, pio2, x, y, z, arcsine;
    
          __m128 fzeroes, fones, ftwos, ffours, condition;
    
      2
          pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    
      2
          fzeroes = _mm_setzero_ps();
    
      2
          fones = _mm_set1_ps(1.0);
    
      2
          ftwos = _mm_set1_ps(2.0);
    
      2
          ffours = _mm_set1_ps(4.0);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      262136
              aVal = _mm_div_ps(
    
                  aVal,
    
                  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
    
      65534
              z = aVal;
    
      65534
              condition = _mm_cmplt_ps(z, fzeroes);
    
      196602
              z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
    
      65534
              condition = _mm_cmplt_ps(z, fones);
    
      196602
              x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (i = 0; i < 2; i++) {
    
      524272
                  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
    
              }
    
      65534
              x = _mm_div_ps(fones, x);
    
      65534
              y = fzeroes;
    
        2/2✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.

      196602
              for (j = ASIN_TERMS - 1; j >= 0; j--) {
    
      524272
                  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
    
      131068
                                 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
    
              }
    
      131068
              y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
    
      65534
              condition = _mm_cmpgt_ps(z, fones);
    
      196602
              y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
    
      65534
              arcsine = y;
    
      65534
              condition = _mm_cmplt_ps(aVal, fzeroes);
    
      196602
              arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
    
              _mm_storeu_ps(bPtr, arcsine);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = asinf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for unaligned */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *bPtr++ = asinf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_asin_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_asin_32f
12			*
13			* \b Overview
14			*
15			* Computes arcsine of input vector and stores results in output vector.
16			*
17			* <b>Dispatcher Prototype</b>
18			* \code
19			* void volk_32f_asin_32f(float* bVector, const float* aVector, unsigned int num_points)
20			* \endcode
21			*
22			* \b Inputs
23			* \li aVector: The input vector of floats.
24			* \li num_points: The number of data points.
25			*
26			* \b Outputs
27			* \li bVector: The vector where results will be stored.
28			*
29			* \b Example
30			* \code
31			* Calculate common angles around the top half of the unit circle.
32			* int N = 10;
33			* unsigned int alignment = volk_get_alignment();
34			* float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36			*
37			* in[0] = 0;
38			* in[1] = 0.5;
39			* in[2] = std::sqrt(2.f)/2.f;
40			* in[3] = std::sqrt(3.f)/2.f;
41			* in[4] = in[5] = 1;
42			* for(unsigned int ii = 6; ii < N; ++ii){
43			* in[ii] = - in[N-ii-1];
44			* }
45			*
46			* volk_32f_asin_32f(out, in, N);
47			*
48			* for(unsigned int ii = 0; ii < N; ++ii){
49			* printf("asin(%1.3f) = %1.3f\n", in[ii], out[ii]);
50			* }
51			*
52			* volk_free(in);
53			* volk_free(out);
54			* \endcode
55			*/
56
57			#include <inttypes.h>
58			#include <math.h>
59			#include <stdio.h>
60
61			/* This is the number of terms of Taylor series to evaluate, increase this for more
62			* accuracy*/
63			#define ASIN_TERMS 2
64
65			#ifndef INCLUDED_volk_32f_asin_32f_a_H
66			#define INCLUDED_volk_32f_asin_32f_a_H
67
68			#if LV_HAVE_AVX2 && LV_HAVE_FMA
69			#include <immintrin.h>
70
71		2	static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
72			const float* aVector,
73			unsigned int num_points)
74			{
75		2	float* bPtr = bVector;
76		2	const float* aPtr = aVector;
77
78		2	unsigned int number = 0;
79		2	unsigned int eighthPoints = num_points / 8;
80			int i, j;
81
82			__m256 aVal, pio2, x, y, z, arcsine;
83			__m256 fzeroes, fones, ftwos, ffours, condition;
84
85		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
86		2	fzeroes = _mm256_setzero_ps();
87		2	fones = _mm256_set1_ps(1.0);
88		2	ftwos = _mm256_set1_ps(2.0);
89		2	ffours = _mm256_set1_ps(4.0);
90
91	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
92		32766	aVal = _mm256_load_ps(aPtr);
93		131064	aVal = _mm256_div_ps(aVal,
94			_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
95			_mm256_sub_ps(fones, aVal))));
96		32766	z = aVal;
97		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
98		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
99		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
100		98298	x = _mm256_add_ps(
101			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
102
103	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++) {
104		196596	x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
105			}
106		32766	x = _mm256_div_ps(fones, x);
107		32766	y = fzeroes;
108	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ASIN_TERMS - 1; j >= 0; j--) {
109		196596	y = _mm256_fmadd_ps(
110		65532	y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
111			}
112
113		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
115
116		65532	y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
117		32766	arcsine = y;
118		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119		98298	arcsine = _mm256_sub_ps(arcsine,
120			_mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
121
122			_mm256_store_ps(bPtr, arcsine);
123		32766	aPtr += 8;
124		32766	bPtr += 8;
125			}
126
127		2	number = eighthPoints * 8;
128	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
129		14	*bPtr++ = asin(*aPtr++);
130			}
131		2	}
132
133			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
134
135
136			#ifdef LV_HAVE_AVX
137			#include <immintrin.h>
138
139			static inline void
140		2	volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
141			{
142		2	float* bPtr = bVector;
143		2	const float* aPtr = aVector;
144
145		2	unsigned int number = 0;
146		2	unsigned int eighthPoints = num_points / 8;
147			int i, j;
148
149			__m256 aVal, pio2, x, y, z, arcsine;
150			__m256 fzeroes, fones, ftwos, ffours, condition;
151
152		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
153		2	fzeroes = _mm256_setzero_ps();
154		2	fones = _mm256_set1_ps(1.0);
155		2	ftwos = _mm256_set1_ps(2.0);
156		2	ffours = _mm256_set1_ps(4.0);
157
158	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
159		32766	aVal = _mm256_load_ps(aPtr);
160		131064	aVal = _mm256_div_ps(aVal,
161			_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
162			_mm256_sub_ps(fones, aVal))));
163		32766	z = aVal;
164		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
165		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
166		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
167		98298	x = _mm256_add_ps(
168			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
169
170	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++) {
171		262128	x = _mm256_add_ps(x,
172			_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
173			}
174		32766	x = _mm256_div_ps(fones, x);
175		32766	y = fzeroes;
176	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ASIN_TERMS - 1; j >= 0; j--) {
177		262128	y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
178		65532	_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
179			}
180
181		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
182		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
183
184		98298	y = _mm256_add_ps(
185			y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
186		32766	arcsine = y;
187		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
188		98298	arcsine = _mm256_sub_ps(arcsine,
189			_mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
190
191			_mm256_store_ps(bPtr, arcsine);
192		32766	aPtr += 8;
193		32766	bPtr += 8;
194			}
195
196		2	number = eighthPoints * 8;
197	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
198		14	*bPtr++ = asin(*aPtr++);
199			}
200		2	}
201
202			#endif /* LV_HAVE_AVX for aligned */
203
204			#ifdef LV_HAVE_SSE4_1
205			#include <smmintrin.h>
206
207			static inline void
208		2	volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
209			{
210		2	float* bPtr = bVector;
211		2	const float* aPtr = aVector;
212
213		2	unsigned int number = 0;
214		2	unsigned int quarterPoints = num_points / 4;
215			int i, j;
216
217			__m128 aVal, pio2, x, y, z, arcsine;
218			__m128 fzeroes, fones, ftwos, ffours, condition;
219
220		2	pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
221		2	fzeroes = _mm_setzero_ps();
222		2	fones = _mm_set1_ps(1.0);
223		2	ftwos = _mm_set1_ps(2.0);
224		2	ffours = _mm_set1_ps(4.0);
225
226	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
227		65534	aVal = _mm_load_ps(aPtr);
228		262136	aVal = _mm_div_ps(
229			aVal,
230			_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
231		65534	z = aVal;
232		65534	condition = _mm_cmplt_ps(z, fzeroes);
233		196602	z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
234		65534	condition = _mm_cmplt_ps(z, fones);
235		196602	x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
236
237	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (i = 0; i < 2; i++) {
238		524272	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
239			}
240		65534	x = _mm_div_ps(fones, x);
241		65534	y = fzeroes;
242	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (j = ASIN_TERMS - 1; j >= 0; j--) {
243		524272	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
244		131068	_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
245			}
246
247		131068	y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
248		65534	condition = _mm_cmpgt_ps(z, fones);
249
250		196602	y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
251		65534	arcsine = y;
252		65534	condition = _mm_cmplt_ps(aVal, fzeroes);
253		196602	arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
254
255			_mm_store_ps(bPtr, arcsine);
256		65534	aPtr += 4;
257		65534	bPtr += 4;
258			}
259
260		2	number = quarterPoints * 4;
261	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
262		6	*bPtr++ = asinf(*aPtr++);
263			}
264		2	}
265
266			#endif /* LV_HAVE_SSE4_1 for aligned */
267
268			#endif /* INCLUDED_volk_32f_asin_32f_a_H */
269
270			#ifndef INCLUDED_volk_32f_asin_32f_u_H
271			#define INCLUDED_volk_32f_asin_32f_u_H
272
273			#if LV_HAVE_AVX2 && LV_HAVE_FMA
274			#include <immintrin.h>
275
276		2	static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
277			const float* aVector,
278			unsigned int num_points)
279			{
280		2	float* bPtr = bVector;
281		2	const float* aPtr = aVector;
282
283		2	unsigned int number = 0;
284		2	unsigned int eighthPoints = num_points / 8;
285			int i, j;
286
287			__m256 aVal, pio2, x, y, z, arcsine;
288			__m256 fzeroes, fones, ftwos, ffours, condition;
289
290		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
291		2	fzeroes = _mm256_setzero_ps();
292		2	fones = _mm256_set1_ps(1.0);
293		2	ftwos = _mm256_set1_ps(2.0);
294		2	ffours = _mm256_set1_ps(4.0);
295
296	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
297		32766	aVal = _mm256_loadu_ps(aPtr);
298		131064	aVal = _mm256_div_ps(aVal,
299			_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
300			_mm256_sub_ps(fones, aVal))));
301		32766	z = aVal;
302		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
303		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
304		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
305		98298	x = _mm256_add_ps(
306			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
307
308	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++) {
309		196596	x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
310			}
311		32766	x = _mm256_div_ps(fones, x);
312		32766	y = fzeroes;
313	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ASIN_TERMS - 1; j >= 0; j--) {
314		196596	y = _mm256_fmadd_ps(
315		65532	y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
316			}
317
318		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
319		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
320
321		65532	y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
322		32766	arcsine = y;
323		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
324		98298	arcsine = _mm256_sub_ps(arcsine,
325			_mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
326
327			_mm256_storeu_ps(bPtr, arcsine);
328		32766	aPtr += 8;
329		32766	bPtr += 8;
330			}
331
332		2	number = eighthPoints * 8;
333	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
334		14	*bPtr++ = asin(*aPtr++);
335			}
336		2	}
337
338			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
339
340
341			#ifdef LV_HAVE_AVX
342			#include <immintrin.h>
343
344			static inline void
345		2	volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
346			{
347		2	float* bPtr = bVector;
348		2	const float* aPtr = aVector;
349
350		2	unsigned int number = 0;
351		2	unsigned int eighthPoints = num_points / 8;
352			int i, j;
353
354			__m256 aVal, pio2, x, y, z, arcsine;
355			__m256 fzeroes, fones, ftwos, ffours, condition;
356
357		2	pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
358		2	fzeroes = _mm256_setzero_ps();
359		2	fones = _mm256_set1_ps(1.0);
360		2	ftwos = _mm256_set1_ps(2.0);
361		2	ffours = _mm256_set1_ps(4.0);
362
363	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
364		32766	aVal = _mm256_loadu_ps(aPtr);
365		131064	aVal = _mm256_div_ps(aVal,
366			_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
367			_mm256_sub_ps(fones, aVal))));
368		32766	z = aVal;
369		32766	condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
370		65532	z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
371		32766	condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
372		98298	x = _mm256_add_ps(
373			z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
374
375	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (i = 0; i < 2; i++) {
376		262128	x = _mm256_add_ps(x,
377			_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
378			}
379		32766	x = _mm256_div_ps(fones, x);
380		32766	y = fzeroes;
381	2/2 ✓ Branch 0 taken 65532 times. ✓ Branch 1 taken 32766 times.	98298	for (j = ASIN_TERMS - 1; j >= 0; j--) {
382		262128	y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
383		65532	_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
384			}
385
386		32766	y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
387		32766	condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
388
389		98298	y = _mm256_add_ps(
390			y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
391		32766	arcsine = y;
392		32766	condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
393		98298	arcsine = _mm256_sub_ps(arcsine,
394			_mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
395
396			_mm256_storeu_ps(bPtr, arcsine);
397		32766	aPtr += 8;
398		32766	bPtr += 8;
399			}
400
401		2	number = eighthPoints * 8;
402	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
403		14	*bPtr++ = asin(*aPtr++);
404			}
405		2	}
406
407			#endif /* LV_HAVE_AVX for unaligned */
408
409
410			#ifdef LV_HAVE_SSE4_1
411			#include <smmintrin.h>
412
413			static inline void
414		2	volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
415			{
416		2	float* bPtr = bVector;
417		2	const float* aPtr = aVector;
418
419		2	unsigned int number = 0;
420		2	unsigned int quarterPoints = num_points / 4;
421			int i, j;
422
423			__m128 aVal, pio2, x, y, z, arcsine;
424			__m128 fzeroes, fones, ftwos, ffours, condition;
425
426		2	pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
427		2	fzeroes = _mm_setzero_ps();
428		2	fones = _mm_set1_ps(1.0);
429		2	ftwos = _mm_set1_ps(2.0);
430		2	ffours = _mm_set1_ps(4.0);
431
432	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
433		65534	aVal = _mm_loadu_ps(aPtr);
434		262136	aVal = _mm_div_ps(
435			aVal,
436			_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
437		65534	z = aVal;
438		65534	condition = _mm_cmplt_ps(z, fzeroes);
439		196602	z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
440		65534	condition = _mm_cmplt_ps(z, fones);
441		196602	x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
442
443	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (i = 0; i < 2; i++) {
444		524272	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
445			}
446		65534	x = _mm_div_ps(fones, x);
447		65534	y = fzeroes;
448	2/2 ✓ Branch 0 taken 131068 times. ✓ Branch 1 taken 65534 times.	196602	for (j = ASIN_TERMS - 1; j >= 0; j--) {
449		524272	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
450		131068	_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
451			}
452
453		131068	y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
454		65534	condition = _mm_cmpgt_ps(z, fones);
455
456		196602	y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
457		65534	arcsine = y;
458		65534	condition = _mm_cmplt_ps(aVal, fzeroes);
459		196602	arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
460
461			_mm_storeu_ps(bPtr, arcsine);
462		65534	aPtr += 4;
463		65534	bPtr += 4;
464			}
465
466		2	number = quarterPoints * 4;
467	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
468		6	*bPtr++ = asinf(*aPtr++);
469			}
470		2	}
471
472			#endif /* LV_HAVE_SSE4_1 for unaligned */
473
474			#ifdef LV_HAVE_GENERIC
475
476			static inline void
477		2	volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
478			{
479		2	float* bPtr = bVector;
480		2	const float* aPtr = aVector;
481		2	unsigned int number = 0;
482
483	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
484		262142	*bPtr++ = asinf(*aPtr++);
485			}
486		2	}
487			#endif /* LV_HAVE_GENERIC */
488
489			#endif /* INCLUDED_volk_32f_asin_32f_u_H */
490