GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_x2_pow_32f.h
Date:	2023-10-23 23:10:04
	Exec	Total	Coverage
Lines:	360	360	100.0%
Functions:	7	7	100.0%
Branches:	26	26	100.0%
  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_x2_pow_32f
    
       *
    
       * \b Overview
    
       *
    
       * Raises the sample in aVector to the power of the number in bVector.
    
       *
    
       * c[i] = pow(a[i], b[i])
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
    
       * unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li bVector: The input vector of exponents (power values).
    
       * \li aVector: The input vector of base values.
    
       * \li num_points: The number of values in both input vectors.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       * Calculate the first ten powers of two (2^x).
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* twos = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = (float)ii;
    
       *       twos[ii] = 2.f;
    
       *   }
    
       *
    
       *   volk_32f_x2_pow_32f(out, increasing, twos, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %1.2f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(twos);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
    
      #define INCLUDED_volk_32f_x2_pow_32f_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <stdlib.h>
    
      #define POW_POLY_DEGREE 3
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*
       * POLYn_AVX2_FMA(x, c0, ..., cn): evaluates c0 + c1*x + ... + cn*x^n on a
       * __m256 in Horner form. Each level broadcasts its lowest coefficient with
       * _mm256_set1_ps and folds it in with one _mm256_fmadd_ps on top of the
       * next-lower macro. POLY0 ignores x and only broadcasts c0.
       * The argument x is expanded once per level, so pass a plain variable.
       */
      #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
    
      #define POLY1_AVX2_FMA(x, c0, c1) \
    
          _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
    
      #define POLY2_AVX2_FMA(x, c0, c1, c2) \
    
          _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
    
      #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    
          _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
    
      #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    
          _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
    
      #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    
          _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
    
      2
      /*!
       * \brief Aligned AVX2+FMA kernel computing cVector[i] = aVector[i] ^ bVector[i].
       *
       * Eight samples are handled per iteration as exp(b * ln(a)):
       *   - ln(a) is built from the exponent field of a plus a polynomial in the
       *     mantissa (degree selected by POW_POLY_DEGREE). Only the exponent and
       *     mantissa bits of a are read, so the sign bit of a is ignored here.
       *   - exp() clamps its argument to +/-88.3762626647949, splits off an integer
       *     power of two and evaluates a polynomial on the remainder.
       * The trailing num_points % 8 samples are computed with powf(), the same
       * single-precision call the generic kernel uses.
       *
       * \param cVector    Output buffer; written with _mm256_store_ps (aligned store).
       * \param bVector    Exponents; read with _mm256_load_ps (aligned load).
       * \param aVector    Bases; read with _mm256_load_ps (aligned load).
       * \param num_points Number of samples in each buffer.
       */
      static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                        const float* bVector,
                                                        const float* aVector,
                                                        unsigned int num_points)
      {
          float* cPtr = cVector;
          const float* bPtr = bVector;
          const float* aPtr = aVector;
          const unsigned int eighthPoints = num_points / 8;

          /* Loop-invariant constants for the logarithm stage. */
          const __m256i bias = _mm256_set1_epi32(127);
          const __m256i exponentMask = _mm256_set1_epi32(0x7f800000);
          const __m256 mantissaMask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff));
          const __m256 leadingOne = _mm256_set1_ps(1.0f);
          const __m256 ln2 = _mm256_set1_ps(0.6931471805);

          /* Loop-invariant constants for the exponential stage. */
          const __m256 one = _mm256_set1_ps(1.0);
          const __m256 exp_hi = _mm256_set1_ps(88.3762626647949);
          const __m256 exp_lo = _mm256_set1_ps(-88.3762626647949);
          const __m256 log2EF = _mm256_set1_ps(1.44269504088896341);
          const __m256 half = _mm256_set1_ps(0.5);
          const __m256 exp_C1 = _mm256_set1_ps(0.693359375);
          const __m256 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
          const __m256i pi32_0x7f = _mm256_set1_epi32(0x7f);
          const __m256 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
          const __m256 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
          const __m256 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
          const __m256 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
          const __m256 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
          const __m256 exp_p5 = _mm256_set1_ps(5.0000001201e-1);

          for (unsigned int number = 0; number < eighthPoints; number++) {
              // First compute the logarithm
              const __m256 aVal = _mm256_load_ps(aPtr);
              /* Unbiased exponent of a: (bits & 0x7f800000) >> 23, minus 127. */
              const __m256i exponent = _mm256_sub_epi32(
                  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), exponentMask),
                                    23),
                  bias);
              __m256 logarithm = _mm256_cvtepi32_ps(exponent);
              /* Mantissa bits OR-ed onto 1.0f, i.e. a value in [1, 2). */
              const __m256 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, mantissaMask));
              __m256 mantissa;

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5_AVX2_FMA(frac,
                                        3.1157899f,
                                        -3.3241990f,
                                        2.5988452f,
                                        -1.2315303f,
                                        3.1821337e-1f,
                                        -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4_AVX2_FMA(frac,
                                        2.8882704548164776201f,
                                        -2.52074962577807006663f,
                                        1.48116647521213171641f,
                                        -0.465725644288844778798f,
                                        0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3_AVX2_FMA(frac,
                                        2.61761038894603480148f,
                                        -1.75647175389045657003f,
                                        0.688243882994381274313f,
                                        -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2_AVX2_FMA(frac,
                                        2.28330284476918490682f,
                                        -1.04913055217340124191f,
                                        0.204446009836232697516f);
      #else
      #error
      #endif

              logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
              logarithm = _mm256_mul_ps(logarithm, ln2);

              // Now calculate b*lna
              __m256 bVal = _mm256_load_ps(bPtr);
              bVal = _mm256_mul_ps(bVal, logarithm);

              // Now compute exp(b*lna)
              bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

              /* fx = floor(bVal * log2(e) + 0.5): truncate, then subtract 1 where
               * the truncated value ended up above the original. */
              __m256 fx = _mm256_fmadd_ps(bVal, log2EF, half);
              __m256 tmp = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(fx));
              const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
              fx = _mm256_sub_ps(tmp, mask);

              /* Remove fx * (exp_C1 + exp_C2) from the argument in two steps. */
              tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
              bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
              const __m256 z = _mm256_mul_ps(bVal, bVal);

              __m256 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
              y = _mm256_fmadd_ps(y, bVal, exp_p2);
              y = _mm256_fmadd_ps(y, bVal, exp_p3);
              y = _mm256_fmadd_ps(y, bVal, exp_p4);
              y = _mm256_fmadd_ps(y, bVal, exp_p5);
              y = _mm256_fmadd_ps(y, z, bVal);
              y = _mm256_add_ps(y, one);

              /* Build 2^fx by placing (fx + 127) into the float exponent field. */
              const __m256i emm0 =
                  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
              const __m256 pow2n = _mm256_castsi256_ps(emm0);

              _mm256_store_ps(cPtr, _mm256_mul_ps(y, pow2n));

              aPtr += 8;
              bPtr += 8;
              cPtr += 8;
          }

          /* Scalar tail: powf keeps the computation in single precision. */
          for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
              *cPtr++ = powf(*aPtr++, *bPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      /*
       * POLYn_AVX2(x, c0, ..., cn): evaluates c0 + c1*x + ... + cn*x^n on a __m256
       * in Horner form using separate _mm256_mul_ps / _mm256_add_ps (no FMA).
       * Each level broadcasts its lowest coefficient with _mm256_set1_ps and
       * builds on the next-lower macro. POLY0 ignores x and only broadcasts c0.
       * The argument x is expanded once per level, so pass a plain variable.
       */
      #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
    
      #define POLY1_AVX2(x, c0, c1) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
    
      #define POLY2_AVX2(x, c0, c1, c2) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
    
      #define POLY3_AVX2(x, c0, c1, c2, c3) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
    
      #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
    
      #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
    
      2
      /*!
       * \brief Aligned AVX2 kernel (no FMA) computing cVector[i] = aVector[i] ^ bVector[i].
       *
       * Eight samples are handled per iteration as exp(b * ln(a)):
       *   - ln(a) is built from the exponent field of a plus a polynomial in the
       *     mantissa (degree selected by POW_POLY_DEGREE). Only the exponent and
       *     mantissa bits of a are read, so the sign bit of a is ignored here.
       *   - exp() clamps its argument to +/-88.3762626647949, splits off an integer
       *     power of two and evaluates a polynomial on the remainder.
       * The trailing num_points % 8 samples are computed with powf(), the same
       * single-precision call the generic kernel uses.
       *
       * \param cVector    Output buffer; written with _mm256_store_ps (aligned store).
       * \param bVector    Exponents; read with _mm256_load_ps (aligned load).
       * \param aVector    Bases; read with _mm256_load_ps (aligned load).
       * \param num_points Number of samples in each buffer.
       */
      static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                                    const float* bVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
      {
          float* cPtr = cVector;
          const float* bPtr = bVector;
          const float* aPtr = aVector;
          const unsigned int eighthPoints = num_points / 8;

          /* Loop-invariant constants for the logarithm stage. */
          const __m256i bias = _mm256_set1_epi32(127);
          const __m256i exponentMask = _mm256_set1_epi32(0x7f800000);
          const __m256 mantissaMask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff));
          const __m256 leadingOne = _mm256_set1_ps(1.0f);
          const __m256 ln2 = _mm256_set1_ps(0.6931471805);

          /* Loop-invariant constants for the exponential stage. */
          const __m256 one = _mm256_set1_ps(1.0);
          const __m256 exp_hi = _mm256_set1_ps(88.3762626647949);
          const __m256 exp_lo = _mm256_set1_ps(-88.3762626647949);
          const __m256 log2EF = _mm256_set1_ps(1.44269504088896341);
          const __m256 half = _mm256_set1_ps(0.5);
          const __m256 exp_C1 = _mm256_set1_ps(0.693359375);
          const __m256 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
          const __m256i pi32_0x7f = _mm256_set1_epi32(0x7f);
          const __m256 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
          const __m256 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
          const __m256 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
          const __m256 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
          const __m256 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
          const __m256 exp_p5 = _mm256_set1_ps(5.0000001201e-1);

          for (unsigned int number = 0; number < eighthPoints; number++) {
              // First compute the logarithm
              const __m256 aVal = _mm256_load_ps(aPtr);
              /* Unbiased exponent of a: (bits & 0x7f800000) >> 23, minus 127. */
              const __m256i exponent = _mm256_sub_epi32(
                  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), exponentMask),
                                    23),
                  bias);
              __m256 logarithm = _mm256_cvtepi32_ps(exponent);
              /* Mantissa bits OR-ed onto 1.0f, i.e. a value in [1, 2). */
              const __m256 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, mantissaMask));
              __m256 mantissa;

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5_AVX2(frac,
                                    3.1157899f,
                                    -3.3241990f,
                                    2.5988452f,
                                    -1.2315303f,
                                    3.1821337e-1f,
                                    -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4_AVX2(frac,
                                    2.8882704548164776201f,
                                    -2.52074962577807006663f,
                                    1.48116647521213171641f,
                                    -0.465725644288844778798f,
                                    0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3_AVX2(frac,
                                    2.61761038894603480148f,
                                    -1.75647175389045657003f,
                                    0.688243882994381274313f,
                                    -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2_AVX2(frac,
                                    2.28330284476918490682f,
                                    -1.04913055217340124191f,
                                    0.204446009836232697516f);
      #else
      #error
      #endif

              logarithm = _mm256_add_ps(
                  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
              logarithm = _mm256_mul_ps(logarithm, ln2);

              // Now calculate b*lna
              __m256 bVal = _mm256_load_ps(bPtr);
              bVal = _mm256_mul_ps(bVal, logarithm);

              // Now compute exp(b*lna)
              bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

              /* fx = floor(bVal * log2(e) + 0.5): truncate, then subtract 1 where
               * the truncated value ended up above the original. */
              __m256 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
              __m256 tmp = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(fx));
              const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
              fx = _mm256_sub_ps(tmp, mask);

              /* Remove fx * (exp_C1 + exp_C2) from the argument in two steps. */
              tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
              bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
              const __m256 z = _mm256_mul_ps(bVal, bVal);

              __m256 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
              y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
              y = _mm256_add_ps(y, one);

              /* Build 2^fx by placing (fx + 127) into the float exponent field. */
              const __m256i emm0 =
                  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
              const __m256 pow2n = _mm256_castsi256_ps(emm0);

              _mm256_store_ps(cPtr, _mm256_mul_ps(y, pow2n));

              aPtr += 8;
              bPtr += 8;
              cPtr += 8;
          }

          /* Scalar tail: powf keeps the computation in single precision. */
          for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
              *cPtr++ = powf(*aPtr++, *bPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      /*
       * POLYn(x, c0, ..., cn): evaluates c0 + c1*x + ... + cn*x^n on a __m128 in
       * Horner form using _mm_mul_ps / _mm_add_ps. Each level broadcasts its
       * lowest coefficient with _mm_set1_ps and builds on the next-lower macro.
       * POLY0 ignores x and only broadcasts c0.
       * The argument x is expanded once per level, so pass a plain variable.
       */
      #define POLY0(x, c0) _mm_set1_ps(c0)
    
      #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
    
      #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
    
      #define POLY3(x, c0, c1, c2, c3) \
    
          _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
    
      #define POLY4(x, c0, c1, c2, c3, c4) \
    
          _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
    
      #define POLY5(x, c0, c1, c2, c3, c4, c5) \
    
          _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
    
      2
      /*!
       * \brief Aligned SSE4.1 kernel computing cVector[i] = aVector[i] ^ bVector[i].
       *
       * Four samples per iteration are evaluated as exp(b * ln(a)). ln(a) comes
       * from the exponent field of a plus a polynomial in the mantissa (degree
       * selected by POW_POLY_DEGREE); exp() clamps its argument, splits off an
       * integer power of two and evaluates a polynomial on the remainder.
       * The trailing num_points % 4 samples are computed with powf().
       *
       * \param cVector    Output buffer; written with _mm_store_ps (aligned store).
       * \param bVector    Exponents; read with _mm_load_ps (aligned load).
       * \param aVector    Bases; read with _mm_load_ps (aligned load).
       * \param num_points Number of samples in each buffer.
       */
      static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                      const float* bVector,
                                                      const float* aVector,
                                                      unsigned int num_points)
      {
          const unsigned int quarterPoints = num_points / 4;
          const unsigned int tailStart = quarterPoints * 4;
          float* out = cVector;
          const float* expoIn = bVector;
          const float* baseIn = aVector;

          const __m128 one = _mm_set1_ps(1.0);
          const __m128 exp_hi = _mm_set1_ps(88.3762626647949);
          const __m128 exp_lo = _mm_set1_ps(-88.3762626647949);
          const __m128 ln2 = _mm_set1_ps(0.6931471805);
          const __m128 log2EF = _mm_set1_ps(1.44269504088896341);
          const __m128 half = _mm_set1_ps(0.5);
          const __m128 exp_C1 = _mm_set1_ps(0.693359375);
          const __m128 exp_C2 = _mm_set1_ps(-2.12194440e-4);
          const __m128i pi32_0x7f = _mm_set1_epi32(0x7f);
          const __m128 exp_p0 = _mm_set1_ps(1.9875691500e-4);
          const __m128 exp_p1 = _mm_set1_ps(1.3981999507e-3);
          const __m128 exp_p2 = _mm_set1_ps(8.3334519073e-3);
          const __m128 exp_p3 = _mm_set1_ps(4.1665795894e-2);
          const __m128 exp_p4 = _mm_set1_ps(1.6666665459e-1);
          const __m128 exp_p5 = _mm_set1_ps(5.0000001201e-1);
          const __m128i bias = _mm_set1_epi32(127);
          const __m128 leadingOne = _mm_set1_ps(1.0f);

          for (unsigned int block = 0; block < quarterPoints; block++) {
              /* --- ln(a): unbiased exponent + polynomial in the mantissa --- */
              const __m128 base = _mm_load_ps(baseIn);
              const __m128i expField =
                  _mm_and_si128(_mm_castps_si128(base), _mm_set1_epi32(0x7f800000));
              const __m128i unbiased = _mm_sub_epi32(_mm_srli_epi32(expField, 23), bias);
              __m128 logarithm = _mm_cvtepi32_ps(unbiased);
              const __m128 frac = _mm_or_ps(
                  leadingOne, _mm_and_ps(base, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
              __m128 mantissa;

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5(frac,
                               3.1157899f,
                               -3.3241990f,
                               2.5988452f,
                               -1.2315303f,
                               3.1821337e-1f,
                               -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4(frac,
                               2.8882704548164776201f,
                               -2.52074962577807006663f,
                               1.48116647521213171641f,
                               -0.465725644288844778798f,
                               0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3(frac,
                               2.61761038894603480148f,
                               -1.75647175389045657003f,
                               0.688243882994381274313f,
                               -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2(frac,
                               2.28330284476918490682f,
                               -1.04913055217340124191f,
                               0.204446009836232697516f);
      #else
      #error
      #endif

              logarithm =
                  _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
              logarithm = _mm_mul_ps(logarithm, ln2);

              /* --- b * ln(a), clamped to the range the exp stage accepts --- */
              __m128 arg = _mm_mul_ps(_mm_load_ps(expoIn), logarithm);
              arg = _mm_max_ps(_mm_min_ps(arg, exp_hi), exp_lo);

              /* --- exp(arg): n = floor(arg * log2(e) + 0.5) --- */
              const __m128 scaled = _mm_add_ps(_mm_mul_ps(arg, log2EF), half);
              const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(scaled));
              const __m128 overshoot = _mm_and_ps(_mm_cmpgt_ps(truncated, scaled), one);
              const __m128 n = _mm_sub_ps(truncated, overshoot);

              /* Remove n * (exp_C1 + exp_C2) from the argument in two steps. */
              const __m128 hiPart = _mm_mul_ps(n, exp_C1);
              const __m128 loPart = _mm_mul_ps(n, exp_C2);
              arg = _mm_sub_ps(_mm_sub_ps(arg, hiPart), loPart);
              const __m128 argSq = _mm_mul_ps(arg, arg);

              /* Polynomial in arg, one operation per line, same order as before. */
              __m128 poly = _mm_mul_ps(exp_p0, arg);
              poly = _mm_add_ps(poly, exp_p1);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p2);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p3);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p4);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p5);
              poly = _mm_mul_ps(poly, argSq);
              poly = _mm_add_ps(poly, arg);
              poly = _mm_add_ps(poly, one);

              /* 2^n: (n + 127) shifted into the float exponent field. */
              const __m128i pow2Bits =
                  _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(n), pi32_0x7f), 23);
              _mm_store_ps(out, _mm_mul_ps(poly, _mm_castsi128_ps(pow2Bits)));

              baseIn += 4;
              expoIn += 4;
              out += 4;
          }

          /* Scalar tail for the samples that do not fill a whole vector. */
          for (unsigned int i = tailStart; i < num_points; i++) {
              *out++ = powf(*baseIn++, *expoIn++);
          }
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
    
      #define INCLUDED_volk_32f_x2_pow_32f_u_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <stdlib.h>
    
      #define POW_POLY_DEGREE 3
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_x2_pow_32f_generic(float* cVector,
    
                                                     const float* bVector,
    
                                                     const float* aVector,
    
                                                     unsigned int num_points)
    
      {
    
      2
          float* cPtr = cVector;
    
      2
          const float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = powf(*aPtr++, *bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      /*
       * POLYn(x, c0, ..., cn): evaluates c0 + c1*x + ... + cn*x^n on a __m128 in
       * Horner form using _mm_mul_ps / _mm_add_ps. Each level broadcasts its
       * lowest coefficient with _mm_set1_ps and builds on the next-lower macro.
       * POLY0 ignores x and only broadcasts c0.
       * The argument x is expanded once per level, so pass a plain variable.
       */
      #define POLY0(x, c0) _mm_set1_ps(c0)
    
      #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
    
      #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
    
      #define POLY3(x, c0, c1, c2, c3) \
    
          _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
    
      #define POLY4(x, c0, c1, c2, c3, c4) \
    
          _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
    
      #define POLY5(x, c0, c1, c2, c3, c4, c5) \
    
          _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
    
      2
      /*!
       * \brief Unaligned SSE4.1 kernel computing cVector[i] = aVector[i] ^ bVector[i].
       *
       * Four samples per iteration are evaluated as exp(b * ln(a)). ln(a) comes
       * from the exponent field of a plus a polynomial in the mantissa (degree
       * selected by POW_POLY_DEGREE); exp() clamps its argument, splits off an
       * integer power of two and evaluates a polynomial on the remainder.
       * The trailing num_points % 4 samples are computed with powf().
       *
       * \param cVector    Output buffer; written with _mm_storeu_ps (unaligned store).
       * \param bVector    Exponents; read with _mm_loadu_ps (unaligned load).
       * \param aVector    Bases; read with _mm_loadu_ps (unaligned load).
       * \param num_points Number of samples in each buffer.
       */
      static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                      const float* bVector,
                                                      const float* aVector,
                                                      unsigned int num_points)
      {
          const unsigned int quarterPoints = num_points / 4;
          const unsigned int tailStart = quarterPoints * 4;
          float* out = cVector;
          const float* expoIn = bVector;
          const float* baseIn = aVector;

          const __m128 one = _mm_set1_ps(1.0);
          const __m128 exp_hi = _mm_set1_ps(88.3762626647949);
          const __m128 exp_lo = _mm_set1_ps(-88.3762626647949);
          const __m128 ln2 = _mm_set1_ps(0.6931471805);
          const __m128 log2EF = _mm_set1_ps(1.44269504088896341);
          const __m128 half = _mm_set1_ps(0.5);
          const __m128 exp_C1 = _mm_set1_ps(0.693359375);
          const __m128 exp_C2 = _mm_set1_ps(-2.12194440e-4);
          const __m128i pi32_0x7f = _mm_set1_epi32(0x7f);
          const __m128 exp_p0 = _mm_set1_ps(1.9875691500e-4);
          const __m128 exp_p1 = _mm_set1_ps(1.3981999507e-3);
          const __m128 exp_p2 = _mm_set1_ps(8.3334519073e-3);
          const __m128 exp_p3 = _mm_set1_ps(4.1665795894e-2);
          const __m128 exp_p4 = _mm_set1_ps(1.6666665459e-1);
          const __m128 exp_p5 = _mm_set1_ps(5.0000001201e-1);
          const __m128i bias = _mm_set1_epi32(127);
          const __m128 leadingOne = _mm_set1_ps(1.0f);

          for (unsigned int block = 0; block < quarterPoints; block++) {
              /* --- ln(a): unbiased exponent + polynomial in the mantissa --- */
              const __m128 base = _mm_loadu_ps(baseIn);
              const __m128i expField =
                  _mm_and_si128(_mm_castps_si128(base), _mm_set1_epi32(0x7f800000));
              const __m128i unbiased = _mm_sub_epi32(_mm_srli_epi32(expField, 23), bias);
              __m128 logarithm = _mm_cvtepi32_ps(unbiased);
              const __m128 frac = _mm_or_ps(
                  leadingOne, _mm_and_ps(base, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
              __m128 mantissa;

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5(frac,
                               3.1157899f,
                               -3.3241990f,
                               2.5988452f,
                               -1.2315303f,
                               3.1821337e-1f,
                               -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4(frac,
                               2.8882704548164776201f,
                               -2.52074962577807006663f,
                               1.48116647521213171641f,
                               -0.465725644288844778798f,
                               0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3(frac,
                               2.61761038894603480148f,
                               -1.75647175389045657003f,
                               0.688243882994381274313f,
                               -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2(frac,
                               2.28330284476918490682f,
                               -1.04913055217340124191f,
                               0.204446009836232697516f);
      #else
      #error
      #endif

              logarithm =
                  _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
              logarithm = _mm_mul_ps(logarithm, ln2);

              /* --- b * ln(a), clamped to the range the exp stage accepts --- */
              __m128 arg = _mm_mul_ps(_mm_loadu_ps(expoIn), logarithm);
              arg = _mm_max_ps(_mm_min_ps(arg, exp_hi), exp_lo);

              /* --- exp(arg): n = floor(arg * log2(e) + 0.5) --- */
              const __m128 scaled = _mm_add_ps(_mm_mul_ps(arg, log2EF), half);
              const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(scaled));
              const __m128 overshoot = _mm_and_ps(_mm_cmpgt_ps(truncated, scaled), one);
              const __m128 n = _mm_sub_ps(truncated, overshoot);

              /* Remove n * (exp_C1 + exp_C2) from the argument in two steps. */
              const __m128 hiPart = _mm_mul_ps(n, exp_C1);
              const __m128 loPart = _mm_mul_ps(n, exp_C2);
              arg = _mm_sub_ps(_mm_sub_ps(arg, hiPart), loPart);
              const __m128 argSq = _mm_mul_ps(arg, arg);

              /* Polynomial in arg, one operation per line, same order as before. */
              __m128 poly = _mm_mul_ps(exp_p0, arg);
              poly = _mm_add_ps(poly, exp_p1);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p2);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p3);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p4);
              poly = _mm_mul_ps(poly, arg);
              poly = _mm_add_ps(poly, exp_p5);
              poly = _mm_mul_ps(poly, argSq);
              poly = _mm_add_ps(poly, arg);
              poly = _mm_add_ps(poly, one);

              /* 2^n: (n + 127) shifted into the float exponent field. */
              const __m128i pow2Bits =
                  _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(n), pi32_0x7f), 23);
              _mm_storeu_ps(out, _mm_mul_ps(poly, _mm_castsi128_ps(pow2Bits)));

              baseIn += 4;
              expoIn += 4;
              out += 4;
          }

          /* Scalar tail for the samples that do not fill a whole vector. */
          for (unsigned int i = tailStart; i < num_points; i++) {
              *out++ = powf(*baseIn++, *expoIn++);
          }
      }
    
      #endif /* LV_HAVE_SSE4_1 for unaligned */
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*
       * Polynomial evaluators on packed single-precision lanes, built with
       * fused multiply-add.
       *
       * POLYn_AVX2_FMA(x, c0, ..., cn) expands to the nested (Horner) form
       *     c0 + x * (c1 + x * (... + x * cn))
       * where every scalar coefficient is broadcast with _mm256_set1_ps and
       * each nesting level is a single _mm256_fmadd_ps.
       *
       * POLY0_AVX2_FMA ignores x and only broadcasts c0. The higher-degree
       * macros expand x once per nesting level, so the argument should be a
       * plain variable rather than an expression with side effects.
       */
      #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
    
      #define POLY1_AVX2_FMA(x, c0, c1) \
    
          _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
    
      #define POLY2_AVX2_FMA(x, c0, c1, c2) \
    
          _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
    
      #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    
          _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
    
      #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    
          _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
    
      #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    
          _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
    
      2
      /*!
       * \brief AVX2 + FMA kernel computing cVector[i] = aVector[i] ^ bVector[i]
       *        with unaligned loads and stores.
       *
       * Eight elements are processed per iteration as exp(b * ln(a)):
       *   1. log2(a) is assembled from the biased exponent field of a plus a
       *      polynomial (degree chosen by POW_POLY_DEGREE) in the mantissa,
       *      then scaled by ln(2).
       *   2. The product b * ln(a) is clamped to [exp_lo, exp_hi], split into an
       *      integer multiple n of ln(2) and a remainder r, exp(r) is evaluated
       *      with a degree-5 polynomial, and the result is scaled by 2^n.
       * Elements that do not fill a complete group of eight are handled with
       * powf().
       *
       * \note The vector path masks off the sign bit of a and does not
       *       special-case zero, denormal, infinite or NaN inputs, so for such
       *       values it can differ from the powf() used on the tail.
       *
       * \param cVector    Output buffer, num_points floats.
       * \param bVector    Exponents, num_points floats.
       * \param aVector    Bases, num_points floats.
       * \param num_points Number of elements to process.
       */
      static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                        const float* bVector,
                                                        const float* aVector,
                                                        unsigned int num_points)
      {
          float* cPtr = cVector;
          const float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          const unsigned int eighthPoints = num_points / 8;

          __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
          __m256 tmp, fx, mask, pow2n, z, y;
          __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
          __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
          __m256i bias, exp, emm0, pi32_0x7f;

          one = _mm256_set1_ps(1.0);
          exp_hi = _mm256_set1_ps(88.3762626647949);
          exp_lo = _mm256_set1_ps(-88.3762626647949);
          ln2 = _mm256_set1_ps(0.6931471805);
          log2EF = _mm256_set1_ps(1.44269504088896341);
          half = _mm256_set1_ps(0.5);
          // exp_C1 + exp_C2 sum to ln(2); the split keeps the reduction accurate
          exp_C1 = _mm256_set1_ps(0.693359375);
          exp_C2 = _mm256_set1_ps(-2.12194440e-4);
          pi32_0x7f = _mm256_set1_epi32(0x7f);

          exp_p0 = _mm256_set1_ps(1.9875691500e-4);
          exp_p1 = _mm256_set1_ps(1.3981999507e-3);
          exp_p2 = _mm256_set1_ps(8.3334519073e-3);
          exp_p3 = _mm256_set1_ps(4.1665795894e-2);
          exp_p4 = _mm256_set1_ps(1.6666665459e-1);
          exp_p5 = _mm256_set1_ps(5.0000001201e-1);

          // Loop-invariant constants of the logarithm stage
          bias = _mm256_set1_epi32(127);
          leadingOne = _mm256_set1_ps(1.0f);

          for (; number < eighthPoints; number++) {
              // First compute the logarithm
              aVal = _mm256_loadu_ps(aPtr);

              // Unbiased exponent: ((bits & 0x7f800000) >> 23) - 127
              exp = _mm256_sub_epi32(
                  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                                     _mm256_set1_epi32(0x7f800000)),
                                    23),
                  bias);
              logarithm = _mm256_cvtepi32_ps(exp);

              // Mantissa bits OR-ed onto the bit pattern of 1.0f -> value in [1, 2)
              frac = _mm256_or_ps(
                  leadingOne,
                  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5_AVX2_FMA(frac,
                                        3.1157899f,
                                        -3.3241990f,
                                        2.5988452f,
                                        -1.2315303f,
                                        3.1821337e-1f,
                                        -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4_AVX2_FMA(frac,
                                        2.8882704548164776201f,
                                        -2.52074962577807006663f,
                                        1.48116647521213171641f,
                                        -0.465725644288844778798f,
                                        0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3_AVX2_FMA(frac,
                                        2.61761038894603480148f,
                                        -1.75647175389045657003f,
                                        0.688243882994381274313f,
                                        -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2_AVX2_FMA(frac,
                                        2.28330284476918490682f,
                                        -1.04913055217340124191f,
                                        0.204446009836232697516f);
      #else
      #error
      #endif

              // log2(a) = exponent + poly(frac) * (frac - 1), then convert to ln(a)
              logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
              logarithm = _mm256_mul_ps(logarithm, ln2);

              // Now calculate b*lna
              bVal = _mm256_loadu_ps(bPtr);
              bVal = _mm256_mul_ps(bVal, logarithm);

              // Now compute exp(b*lna)
              bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

              fx = _mm256_fmadd_ps(bVal, log2EF, half);

              // floor(fx): truncate, then subtract 1 where truncation rounded up
              emm0 = _mm256_cvttps_epi32(fx);
              tmp = _mm256_cvtepi32_ps(emm0);
              mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
              fx = _mm256_sub_ps(tmp, mask);

              // Remainder r = x - fx * (exp_C1 + exp_C2)
              tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
              bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
              z = _mm256_mul_ps(bVal, bVal);

              // exp(r) ~= 1 + r + r^2 * P(r), P evaluated in nested form
              y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
              y = _mm256_fmadd_ps(y, bVal, exp_p2);
              y = _mm256_fmadd_ps(y, bVal, exp_p3);
              y = _mm256_fmadd_ps(y, bVal, exp_p4);
              y = _mm256_fmadd_ps(y, bVal, exp_p5);
              y = _mm256_fmadd_ps(y, z, bVal);
              y = _mm256_add_ps(y, one);

              // Build 2^fx by writing (fx + 127) into the float exponent field
              emm0 =
                  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

              pow2n = _mm256_castsi256_ps(emm0);
              cVal = _mm256_mul_ps(y, pow2n);

              _mm256_storeu_ps(cPtr, cVal);

              aPtr += 8;
              bPtr += 8;
              cPtr += 8;
          }

          // Scalar tail: single-precision powf, matching the SSE4.1 kernel
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *cPtr++ = powf(*aPtr++, *bPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      /*
       * Polynomial evaluators on packed single-precision lanes for targets
       * without FMA.
       *
       * POLYn_AVX2(x, c0, ..., cn) expands to the nested (Horner) form
       *     c0 + x * (c1 + x * (... + x * cn))
       * where every scalar coefficient is broadcast with _mm256_set1_ps and
       * each nesting level is a separate _mm256_mul_ps followed by
       * _mm256_add_ps.
       *
       * POLY0_AVX2 ignores x and only broadcasts c0. The higher-degree macros
       * expand x once per nesting level, so the argument should be a plain
       * variable rather than an expression with side effects.
       */
      #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
    
      #define POLY1_AVX2(x, c0, c1) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
    
      #define POLY2_AVX2(x, c0, c1, c2) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
    
      #define POLY3_AVX2(x, c0, c1, c2, c3) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
    
      #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
    
      #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    
          _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
    
      2
      /*!
       * \brief AVX2 kernel (no FMA) computing cVector[i] = aVector[i] ^ bVector[i]
       *        with unaligned loads and stores.
       *
       * Eight elements are processed per iteration as exp(b * ln(a)):
       *   1. log2(a) is assembled from the biased exponent field of a plus a
       *      polynomial (degree chosen by POW_POLY_DEGREE) in the mantissa,
       *      then scaled by ln(2).
       *   2. The product b * ln(a) is clamped to [exp_lo, exp_hi], split into an
       *      integer multiple n of ln(2) and a remainder r, exp(r) is evaluated
       *      with a degree-5 polynomial, and the result is scaled by 2^n.
       * Elements that do not fill a complete group of eight are handled with
       * powf().
       *
       * \note The vector path masks off the sign bit of a and does not
       *       special-case zero, denormal, infinite or NaN inputs, so for such
       *       values it can differ from the powf() used on the tail.
       *
       * \param cVector    Output buffer, num_points floats.
       * \param bVector    Exponents, num_points floats.
       * \param aVector    Bases, num_points floats.
       * \param num_points Number of elements to process.
       */
      static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                                    const float* bVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
      {
          float* cPtr = cVector;
          const float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          const unsigned int eighthPoints = num_points / 8;

          __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
          __m256 tmp, fx, mask, pow2n, z, y;
          __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
          __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
          __m256i bias, exp, emm0, pi32_0x7f;

          one = _mm256_set1_ps(1.0);
          exp_hi = _mm256_set1_ps(88.3762626647949);
          exp_lo = _mm256_set1_ps(-88.3762626647949);
          ln2 = _mm256_set1_ps(0.6931471805);
          log2EF = _mm256_set1_ps(1.44269504088896341);
          half = _mm256_set1_ps(0.5);
          // exp_C1 + exp_C2 sum to ln(2); the split keeps the reduction accurate
          exp_C1 = _mm256_set1_ps(0.693359375);
          exp_C2 = _mm256_set1_ps(-2.12194440e-4);
          pi32_0x7f = _mm256_set1_epi32(0x7f);

          exp_p0 = _mm256_set1_ps(1.9875691500e-4);
          exp_p1 = _mm256_set1_ps(1.3981999507e-3);
          exp_p2 = _mm256_set1_ps(8.3334519073e-3);
          exp_p3 = _mm256_set1_ps(4.1665795894e-2);
          exp_p4 = _mm256_set1_ps(1.6666665459e-1);
          exp_p5 = _mm256_set1_ps(5.0000001201e-1);

          // Loop-invariant constants of the logarithm stage
          bias = _mm256_set1_epi32(127);
          leadingOne = _mm256_set1_ps(1.0f);

          for (; number < eighthPoints; number++) {
              // First compute the logarithm
              aVal = _mm256_loadu_ps(aPtr);

              // Unbiased exponent: ((bits & 0x7f800000) >> 23) - 127
              exp = _mm256_sub_epi32(
                  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                                     _mm256_set1_epi32(0x7f800000)),
                                    23),
                  bias);
              logarithm = _mm256_cvtepi32_ps(exp);

              // Mantissa bits OR-ed onto the bit pattern of 1.0f -> value in [1, 2)
              frac = _mm256_or_ps(
                  leadingOne,
                  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

      #if POW_POLY_DEGREE == 6
              mantissa = POLY5_AVX2(frac,
                                    3.1157899f,
                                    -3.3241990f,
                                    2.5988452f,
                                    -1.2315303f,
                                    3.1821337e-1f,
                                    -3.4436006e-2f);
      #elif POW_POLY_DEGREE == 5
              mantissa = POLY4_AVX2(frac,
                                    2.8882704548164776201f,
                                    -2.52074962577807006663f,
                                    1.48116647521213171641f,
                                    -0.465725644288844778798f,
                                    0.0596515482674574969533f);
      #elif POW_POLY_DEGREE == 4
              mantissa = POLY3_AVX2(frac,
                                    2.61761038894603480148f,
                                    -1.75647175389045657003f,
                                    0.688243882994381274313f,
                                    -0.107254423828329604454f);
      #elif POW_POLY_DEGREE == 3
              mantissa = POLY2_AVX2(frac,
                                    2.28330284476918490682f,
                                    -1.04913055217340124191f,
                                    0.204446009836232697516f);
      #else
      #error
      #endif

              // log2(a) = exponent + poly(frac) * (frac - 1), then convert to ln(a)
              logarithm = _mm256_add_ps(
                  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
              logarithm = _mm256_mul_ps(logarithm, ln2);

              // Now calculate b*lna
              bVal = _mm256_loadu_ps(bPtr);
              bVal = _mm256_mul_ps(bVal, logarithm);

              // Now compute exp(b*lna)
              bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

              fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

              // floor(fx): truncate, then subtract 1 where truncation rounded up
              emm0 = _mm256_cvttps_epi32(fx);
              tmp = _mm256_cvtepi32_ps(emm0);
              mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
              fx = _mm256_sub_ps(tmp, mask);

              // Remainder r = x - fx * exp_C1 - fx * exp_C2
              tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
              bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
              z = _mm256_mul_ps(bVal, bVal);

              // exp(r) ~= 1 + r + r^2 * P(r), P evaluated in nested form
              y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
              y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
              y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
              y = _mm256_add_ps(y, one);

              // Build 2^fx by writing (fx + 127) into the float exponent field
              emm0 =
                  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

              pow2n = _mm256_castsi256_ps(emm0);
              cVal = _mm256_mul_ps(y, pow2n);

              _mm256_storeu_ps(cPtr, cVal);

              aPtr += 8;
              bPtr += 8;
              cPtr += 8;
          }

          // Scalar tail: single-precision powf, matching the SSE4.1 kernel
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *cPtr++ = powf(*aPtr++, *bPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 for unaligned */
    
      #endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */