GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_tan_32f.h
Date:	2023-10-23 23:10:04
	Exec	Total	Coverage
Lines:	317	317	100.0%
Functions:	7	7	100.0%
Branches:	38	38	100.0%
  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_tan_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes the tangent of each element of the aVector.
    
       *
    
       * b[i] = tan(a[i])
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The buffer of points.
    
       * \li num_points: The number of values in input buffer.
    
       *
    
       * \b Outputs
    
       * \li bVector: The output buffer.
    
       *
    
       * \b Example
    
       * Calculate tan(theta) for common angles.
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   in[0] = 0.000;
    
       *   in[1] = 0.524;
    
       *   in[2] = 0.785;
    
       *   in[3] = 1.047;
    
       *   in[4] = 1.571;
    
       *   in[5] = -1.571;
    
       *   in[6] = -1.047;
    
       *   in[7] = -0.785;
    
       *   in[8] = -0.524;
    
       *   in[9] = -0.000;
    
       *
    
       *   volk_32f_tan_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #ifndef INCLUDED_volk_32f_tan_32f_a_H
    
      #define INCLUDED_volk_32f_tan_32f_a_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*
       * Aligned AVX2 + FMA kernel: bVector[i] = tan(aVector[i]).
       *
       * bVector / aVector are accessed with aligned 256-bit loads and stores
       * (_mm256_load_ps / _mm256_store_ps) in groups of eight floats; the remaining
       * num_points % 8 elements are computed with tanf().
       *
       * Per group of eight:
       *   1. s = |a|, q = floor(s * 4/pi) (octant index), r = q rounded up to even.
       *   2. s -= r * pi/4, using the split constant pio4A + pio4B ~= pi/4.
       *   3. s is divided by 8, squared, and fed to a polynomial in s whose
       *      coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 with alternating
       *      signs) match the series of 2 * (1 - cos(x)); three passes of
       *      s = s * (4 - s) then undo the division by 8.
       *   4. sine = sqrt((2 - s) * s) and cosine = 1 - s (after halving s) are
       *      swapped / sign-flipped according to the octant and the input sign,
       *      and the result is sine / cosine.
       */
      static inline void
      volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
      {
          float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          unsigned int eighthPoints = num_points / 8;
          unsigned int i = 0;

          __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
              fzeroes;
          __m256 sine, cosine, tangent, condition1, condition2, condition3;
          __m256i q, r, ones, twos, fours;

          m4pi = _mm256_set1_ps(1.273239545); /* 4 / pi */
          pio4A = _mm256_set1_ps(0.78515625); /* high part of pi / 4 */
          pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi / 4 */
          ffours = _mm256_set1_ps(4.0);
          ftwos = _mm256_set1_ps(2.0);
          fones = _mm256_set1_ps(1.0);
          fzeroes = _mm256_setzero_ps();
          ones = _mm256_set1_epi32(1);
          twos = _mm256_set1_epi32(2);
          fours = _mm256_set1_epi32(4);

          cp1 = _mm256_set1_ps(1.0);
          cp2 = _mm256_set1_ps(0.83333333e-1);
          cp3 = _mm256_set1_ps(0.2777778e-2);
          cp4 = _mm256_set1_ps(0.49603e-4);
          cp5 = _mm256_set1_ps(0.551e-6);

          for (; number < eighthPoints; number++) {
              aVal = _mm256_load_ps(aPtr);

              /* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
              s = _mm256_sub_ps(aVal,
                                _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                              _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));

              /* q: octant index; r: q rounded up to the next even integer. */
              q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
              r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

              /* s -= r * pi/4, in two fused steps to limit cancellation error. */
              s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
              s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

              s = _mm256_div_ps(
                  s,
                  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
              s = _mm256_mul_ps(s, s);

              // Evaluate Taylor series
              s = _mm256_mul_ps(
                  _mm256_fmadd_ps(
                      _mm256_fmsub_ps(
                          _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                      s,
                      cp1),
                  s);

              /* Undo the division by 8: each pass doubles the angle. */
              for (i = 0; i < 3; i++) {
                  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
              }
              s = _mm256_div_ps(s, ftwos);

              sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
              cosine = _mm256_sub_ps(fones, s);

              /* condition1: lanes where sine and cosine must be exchanged. */
              condition1 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /*
               * condition2: lanes where sine must be negated, i.e. exactly one of
               * "(q & 4) != 0" and "aVal < 0" holds. The two operands are lane masks
               * (all-ones or zero), so they are combined with a bitwise XOR. A float
               * compare must not be used here: an all-ones mask is a NaN bit pattern
               * and the unordered not-equal predicate reports true when both masks
               * are set.
               */
              condition2 = _mm256_xor_ps(
                  _mm256_cmp_ps(
                      _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
                  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

              /* condition3: lanes where cosine must be negated. */
              condition3 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /* Masked swap of sine / cosine, then masked negation (x - 2x = -x). */
              __m256 temp = cosine;
              cosine =
                  _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
              sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
              sine = _mm256_sub_ps(sine,
                                   _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
              cosine = _mm256_sub_ps(cosine,
                                     _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

              tangent = _mm256_div_ps(sine, cosine);
              _mm256_store_ps(bPtr, tangent);

              aPtr += 8;
              bPtr += 8;
          }

          /* Scalar tail; tanf keeps the computation in single precision. */
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *bPtr++ = tanf(*aPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      /*
       * Aligned AVX2 kernel (no FMA): bVector[i] = tan(aVector[i]).
       *
       * bVector / aVector are accessed with aligned 256-bit loads and stores
       * (_mm256_load_ps / _mm256_store_ps) in groups of eight floats; the remaining
       * num_points % 8 elements are computed with tanf().
       *
       * Per group of eight:
       *   1. s = |a|, q = floor(s * 4/pi) (octant index), r = q rounded up to even.
       *   2. s -= r * pi/4, using the split constant pio4A + pio4B ~= pi/4.
       *   3. s is divided by 8, squared, and fed to a polynomial in s whose
       *      coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 with alternating
       *      signs) match the series of 2 * (1 - cos(x)); three passes of
       *      s = s * (4 - s) then undo the division by 8.
       *   4. sine = sqrt((2 - s) * s) and cosine = 1 - s (after halving s) are
       *      swapped / sign-flipped according to the octant and the input sign,
       *      and the result is sine / cosine.
       */
      static inline void
      volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
      {
          float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          unsigned int eighthPoints = num_points / 8;
          unsigned int i = 0;

          __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
              fzeroes;
          __m256 sine, cosine, tangent, condition1, condition2, condition3;
          __m256i q, r, ones, twos, fours;

          m4pi = _mm256_set1_ps(1.273239545); /* 4 / pi */
          pio4A = _mm256_set1_ps(0.78515625); /* high part of pi / 4 */
          pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi / 4 */
          ffours = _mm256_set1_ps(4.0);
          ftwos = _mm256_set1_ps(2.0);
          fones = _mm256_set1_ps(1.0);
          fzeroes = _mm256_setzero_ps();
          ones = _mm256_set1_epi32(1);
          twos = _mm256_set1_epi32(2);
          fours = _mm256_set1_epi32(4);

          cp1 = _mm256_set1_ps(1.0);
          cp2 = _mm256_set1_ps(0.83333333e-1);
          cp3 = _mm256_set1_ps(0.2777778e-2);
          cp4 = _mm256_set1_ps(0.49603e-4);
          cp5 = _mm256_set1_ps(0.551e-6);

          for (; number < eighthPoints; number++) {
              aVal = _mm256_load_ps(aPtr);

              /* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
              s = _mm256_sub_ps(aVal,
                                _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                              _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));

              /* q: octant index; r: q rounded up to the next even integer. */
              q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
              r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

              /* s -= r * pi/4, in two steps to limit cancellation error. */
              s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
              s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

              s = _mm256_div_ps(
                  s,
                  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
              s = _mm256_mul_ps(s, s);

              // Evaluate Taylor series
              s = _mm256_mul_ps(
                  _mm256_add_ps(
                      _mm256_mul_ps(
                          _mm256_sub_ps(
                              _mm256_mul_ps(
                                  _mm256_add_ps(
                                      _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                                    s),
                                      cp3),
                                  s),
                              cp2),
                          s),
                      cp1),
                  s);

              /* Undo the division by 8: each pass doubles the angle. */
              for (i = 0; i < 3; i++) {
                  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
              }
              s = _mm256_div_ps(s, ftwos);

              sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
              cosine = _mm256_sub_ps(fones, s);

              /* condition1: lanes where sine and cosine must be exchanged. */
              condition1 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /*
               * condition2: lanes where sine must be negated, i.e. exactly one of
               * "(q & 4) != 0" and "aVal < 0" holds. The two operands are lane masks
               * (all-ones or zero), so they are combined with a bitwise XOR. A float
               * compare must not be used here: an all-ones mask is a NaN bit pattern
               * and the unordered not-equal predicate reports true when both masks
               * are set.
               */
              condition2 = _mm256_xor_ps(
                  _mm256_cmp_ps(
                      _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
                  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

              /* condition3: lanes where cosine must be negated. */
              condition3 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /* Masked swap of sine / cosine, then masked negation (x - 2x = -x). */
              __m256 temp = cosine;
              cosine =
                  _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
              sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
              sine = _mm256_sub_ps(sine,
                                   _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
              cosine = _mm256_sub_ps(cosine,
                                     _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

              tangent = _mm256_div_ps(sine, cosine);
              _mm256_store_ps(bPtr, tangent);

              aPtr += 8;
              bPtr += 8;
          }

          /* Scalar tail; tanf keeps the computation in single precision. */
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *bPtr++ = tanf(*aPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      /*
       * Aligned SSE4.1 kernel: bVector[i] = tan(aVector[i]).
       *
       * bVector / aVector are accessed with aligned 128-bit loads and stores
       * (_mm_load_ps / _mm_store_ps) in groups of four floats; the remaining
       * num_points % 4 elements are computed with tanf().
       *
       * Per group of four:
       *   1. s = |a|, q = floor(s * 4/pi) (octant index), r = q rounded up to even.
       *   2. s -= r * pi/4, using the split constant pio4A + pio4B ~= pi/4.
       *   3. s is divided by 8, squared, and fed to a polynomial in s whose
       *      coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 with alternating
       *      signs) match the series of 2 * (1 - cos(x)); three passes of
       *      s = s * (4 - s) then undo the division by 8.
       *   4. sine = sqrt((2 - s) * s) and cosine = 1 - s (after halving s) are
       *      swapped / sign-flipped according to the octant and the input sign,
       *      and the result is sine / cosine.
       */
      static inline void
      volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
      {
          float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          unsigned int quarterPoints = num_points / 4;
          unsigned int i = 0;

          __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
              fzeroes;
          __m128 sine, cosine, tangent, condition1, condition2, condition3;
          __m128i q, r, ones, twos, fours;

          m4pi = _mm_set1_ps(1.273239545); /* 4 / pi */
          pio4A = _mm_set1_ps(0.78515625); /* high part of pi / 4 */
          pio4B = _mm_set1_ps(0.241876e-3); /* low part of pi / 4 */
          ffours = _mm_set1_ps(4.0);
          ftwos = _mm_set1_ps(2.0);
          fones = _mm_set1_ps(1.0);
          fzeroes = _mm_setzero_ps();
          ones = _mm_set1_epi32(1);
          twos = _mm_set1_epi32(2);
          fours = _mm_set1_epi32(4);

          cp1 = _mm_set1_ps(1.0);
          cp2 = _mm_set1_ps(0.83333333e-1);
          cp3 = _mm_set1_ps(0.2777778e-2);
          cp4 = _mm_set1_ps(0.49603e-4);
          cp5 = _mm_set1_ps(0.551e-6);

          for (; number < quarterPoints; number++) {
              aVal = _mm_load_ps(aPtr);

              /* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
              s = _mm_sub_ps(aVal,
                             _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));

              /* q: octant index; r: q rounded up to the next even integer. */
              q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
              r = _mm_add_epi32(q, _mm_and_si128(q, ones));

              /* s -= r * pi/4, in two steps to limit cancellation error. */
              s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
              s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

              s = _mm_div_ps(
                  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
              s = _mm_mul_ps(s, s);

              // Evaluate Taylor series
              s = _mm_mul_ps(
                  _mm_add_ps(
                      _mm_mul_ps(
                          _mm_sub_ps(
                              _mm_mul_ps(
                                  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                             cp3),
                                  s),
                              cp2),
                          s),
                      cp1),
                  s);

              /* Undo the division by 8: each pass doubles the angle. */
              for (i = 0; i < 3; i++) {
                  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
              }
              s = _mm_div_ps(s, ftwos);

              sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
              cosine = _mm_sub_ps(fones, s);

              /* condition1: lanes where sine and cosine must be exchanged. */
              condition1 = _mm_cmpneq_ps(
                  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);

              /*
               * condition2: lanes where sine must be negated, i.e. exactly one of
               * "(q & 4) != 0" and "aVal < 0" holds. The two operands are lane masks
               * (all-ones or zero), so they are combined with a bitwise XOR. A float
               * compare must not be used here: an all-ones mask is a NaN bit pattern
               * and the unordered not-equal predicate reports true when both masks
               * are set.
               */
              condition2 =
                  _mm_xor_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
                             _mm_cmplt_ps(aVal, fzeroes));

              /* condition3: lanes where cosine must be negated. */
              condition3 = _mm_cmpneq_ps(
                  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

              /* Masked swap of sine / cosine, then masked negation (x - 2x = -x). */
              __m128 temp = cosine;
              cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
              sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
              sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, ftwos), condition2));
              cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, ftwos), condition3));

              tangent = _mm_div_ps(sine, cosine);
              _mm_store_ps(bPtr, tangent);

              aPtr += 4;
              bPtr += 4;
          }

          /* Scalar tail for the last num_points % 4 elements. */
          number = quarterPoints * 4;
          for (; number < num_points; number++) {
              *bPtr++ = tanf(*aPtr++);
          }
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #endif /* INCLUDED_volk_32f_tan_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_tan_32f_u_H
    
      #define INCLUDED_volk_32f_tan_32f_u_H
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*
       * Unaligned AVX2 + FMA kernel: bVector[i] = tan(aVector[i]).
       *
       * bVector / aVector are accessed with unaligned 256-bit loads and stores
       * (_mm256_loadu_ps / _mm256_storeu_ps) in groups of eight floats; the
       * remaining num_points % 8 elements are computed with tanf().
       *
       * Per group of eight:
       *   1. s = |a|, q = floor(s * 4/pi) (octant index), r = q rounded up to even.
       *   2. s -= r * pi/4, using the split constant pio4A + pio4B ~= pi/4.
       *   3. s is divided by 8, squared, and fed to a polynomial in s whose
       *      coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 with alternating
       *      signs) match the series of 2 * (1 - cos(x)); three passes of
       *      s = s * (4 - s) then undo the division by 8.
       *   4. sine = sqrt((2 - s) * s) and cosine = 1 - s (after halving s) are
       *      swapped / sign-flipped according to the octant and the input sign,
       *      and the result is sine / cosine.
       */
      static inline void
      volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
      {
          float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          unsigned int eighthPoints = num_points / 8;
          unsigned int i = 0;

          __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
              fzeroes;
          __m256 sine, cosine, tangent, condition1, condition2, condition3;
          __m256i q, r, ones, twos, fours;

          m4pi = _mm256_set1_ps(1.273239545); /* 4 / pi */
          pio4A = _mm256_set1_ps(0.78515625); /* high part of pi / 4 */
          pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi / 4 */
          ffours = _mm256_set1_ps(4.0);
          ftwos = _mm256_set1_ps(2.0);
          fones = _mm256_set1_ps(1.0);
          fzeroes = _mm256_setzero_ps();
          ones = _mm256_set1_epi32(1);
          twos = _mm256_set1_epi32(2);
          fours = _mm256_set1_epi32(4);

          cp1 = _mm256_set1_ps(1.0);
          cp2 = _mm256_set1_ps(0.83333333e-1);
          cp3 = _mm256_set1_ps(0.2777778e-2);
          cp4 = _mm256_set1_ps(0.49603e-4);
          cp5 = _mm256_set1_ps(0.551e-6);

          for (; number < eighthPoints; number++) {
              aVal = _mm256_loadu_ps(aPtr);

              /* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
              s = _mm256_sub_ps(aVal,
                                _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                              _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));

              /* q: octant index; r: q rounded up to the next even integer. */
              q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
              r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

              /* s -= r * pi/4, in two fused steps to limit cancellation error. */
              s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
              s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

              s = _mm256_div_ps(
                  s,
                  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
              s = _mm256_mul_ps(s, s);

              // Evaluate Taylor series
              s = _mm256_mul_ps(
                  _mm256_fmadd_ps(
                      _mm256_fmsub_ps(
                          _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                      s,
                      cp1),
                  s);

              /* Undo the division by 8: each pass doubles the angle. */
              for (i = 0; i < 3; i++) {
                  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
              }
              s = _mm256_div_ps(s, ftwos);

              sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
              cosine = _mm256_sub_ps(fones, s);

              /* condition1: lanes where sine and cosine must be exchanged. */
              condition1 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /*
               * condition2: lanes where sine must be negated, i.e. exactly one of
               * "(q & 4) != 0" and "aVal < 0" holds. The two operands are lane masks
               * (all-ones or zero), so they are combined with a bitwise XOR. A float
               * compare must not be used here: an all-ones mask is a NaN bit pattern
               * and the unordered not-equal predicate reports true when both masks
               * are set.
               */
              condition2 = _mm256_xor_ps(
                  _mm256_cmp_ps(
                      _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
                  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

              /* condition3: lanes where cosine must be negated. */
              condition3 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /* Masked swap of sine / cosine, then masked negation (x - 2x = -x). */
              __m256 temp = cosine;
              cosine =
                  _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
              sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
              sine = _mm256_sub_ps(sine,
                                   _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
              cosine = _mm256_sub_ps(cosine,
                                     _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

              tangent = _mm256_div_ps(sine, cosine);
              _mm256_storeu_ps(bPtr, tangent);

              aPtr += 8;
              bPtr += 8;
          }

          /* Scalar tail; tanf keeps the computation in single precision. */
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *bPtr++ = tanf(*aPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      /*
       * Unaligned AVX2 kernel (no FMA): bVector[i] = tan(aVector[i]).
       *
       * bVector / aVector are accessed with unaligned 256-bit loads and stores
       * (_mm256_loadu_ps / _mm256_storeu_ps) in groups of eight floats; the
       * remaining num_points % 8 elements are computed with tanf().
       *
       * Per group of eight:
       *   1. s = |a|, q = floor(s * 4/pi) (octant index), r = q rounded up to even.
       *   2. s -= r * pi/4, using the split constant pio4A + pio4B ~= pi/4.
       *   3. s is divided by 8, squared, and fed to a polynomial in s whose
       *      coefficients (1, 1/12, 1/360, 1/20160, 1/1814400 with alternating
       *      signs) match the series of 2 * (1 - cos(x)); three passes of
       *      s = s * (4 - s) then undo the division by 8.
       *   4. sine = sqrt((2 - s) * s) and cosine = 1 - s (after halving s) are
       *      swapped / sign-flipped according to the octant and the input sign,
       *      and the result is sine / cosine.
       */
      static inline void
      volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
      {
          float* bPtr = bVector;
          const float* aPtr = aVector;

          unsigned int number = 0;
          unsigned int eighthPoints = num_points / 8;
          unsigned int i = 0;

          __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
              fzeroes;
          __m256 sine, cosine, tangent, condition1, condition2, condition3;
          __m256i q, r, ones, twos, fours;

          m4pi = _mm256_set1_ps(1.273239545); /* 4 / pi */
          pio4A = _mm256_set1_ps(0.78515625); /* high part of pi / 4 */
          pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi / 4 */
          ffours = _mm256_set1_ps(4.0);
          ftwos = _mm256_set1_ps(2.0);
          fones = _mm256_set1_ps(1.0);
          fzeroes = _mm256_setzero_ps();
          ones = _mm256_set1_epi32(1);
          twos = _mm256_set1_epi32(2);
          fours = _mm256_set1_epi32(4);

          cp1 = _mm256_set1_ps(1.0);
          cp2 = _mm256_set1_ps(0.83333333e-1);
          cp3 = _mm256_set1_ps(0.2777778e-2);
          cp4 = _mm256_set1_ps(0.49603e-4);
          cp5 = _mm256_set1_ps(0.551e-6);

          for (; number < eighthPoints; number++) {
              aVal = _mm256_loadu_ps(aPtr);

              /* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
              s = _mm256_sub_ps(aVal,
                                _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                              _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));

              /* q: octant index; r: q rounded up to the next even integer. */
              q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
              r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

              /* s -= r * pi/4, in two steps to limit cancellation error. */
              s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
              s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

              s = _mm256_div_ps(
                  s,
                  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
              s = _mm256_mul_ps(s, s);

              // Evaluate Taylor series
              s = _mm256_mul_ps(
                  _mm256_add_ps(
                      _mm256_mul_ps(
                          _mm256_sub_ps(
                              _mm256_mul_ps(
                                  _mm256_add_ps(
                                      _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                                    s),
                                      cp3),
                                  s),
                              cp2),
                          s),
                      cp1),
                  s);

              /* Undo the division by 8: each pass doubles the angle. */
              for (i = 0; i < 3; i++) {
                  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
              }
              s = _mm256_div_ps(s, ftwos);

              sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
              cosine = _mm256_sub_ps(fones, s);

              /* condition1: lanes where sine and cosine must be exchanged. */
              condition1 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /*
               * condition2: lanes where sine must be negated, i.e. exactly one of
               * "(q & 4) != 0" and "aVal < 0" holds. The two operands are lane masks
               * (all-ones or zero), so they are combined with a bitwise XOR. A float
               * compare must not be used here: an all-ones mask is a NaN bit pattern
               * and the unordered not-equal predicate reports true when both masks
               * are set.
               */
              condition2 = _mm256_xor_ps(
                  _mm256_cmp_ps(
                      _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
                  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

              /* condition3: lanes where cosine must be negated. */
              condition3 = _mm256_cmp_ps(
                  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
                  fzeroes,
                  _CMP_NEQ_UQ);

              /* Masked swap of sine / cosine, then masked negation (x - 2x = -x). */
              __m256 temp = cosine;
              cosine =
                  _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
              sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
              sine = _mm256_sub_ps(sine,
                                   _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
              cosine = _mm256_sub_ps(cosine,
                                     _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

              tangent = _mm256_div_ps(sine, cosine);
              _mm256_storeu_ps(bPtr, tangent);

              aPtr += 8;
              bPtr += 8;
          }

          /* Scalar tail; tanf keeps the computation in single precision. */
          number = eighthPoints * 8;
          for (; number < num_points; number++) {
              *bPtr++ = tanf(*aPtr++);
          }
      }
    
      #endif /* LV_HAVE_AVX2 for unaligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          unsigned int quarterPoints = num_points / 4;
    
      2
          unsigned int i = 0;
    
          __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
    
              fzeroes;
    
          __m128 sine, cosine, tangent, condition1, condition2, condition3;
    
          __m128i q, r, ones, twos, fours;
    
      2
          m4pi = _mm_set1_ps(1.273239545);
    
      2
          pio4A = _mm_set1_ps(0.78515625);
    
      2
          pio4B = _mm_set1_ps(0.241876e-3);
    
      2
          ffours = _mm_set1_ps(4.0);
    
      2
          ftwos = _mm_set1_ps(2.0);
    
      2
          fones = _mm_set1_ps(1.0);
    
      2
          fzeroes = _mm_setzero_ps();
    
      2
          ones = _mm_set1_epi32(1);
    
      2
          twos = _mm_set1_epi32(2);
    
      2
          fours = _mm_set1_epi32(4);
    
      2
          cp1 = _mm_set1_ps(1.0);
    
      2
          cp2 = _mm_set1_ps(0.83333333e-1);
    
      2
          cp3 = _mm_set1_ps(0.2777778e-2);
    
      2
          cp4 = _mm_set1_ps(0.49603e-4);
    
      2
          cp5 = _mm_set1_ps(0.551e-6);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      262136
              s = _mm_sub_ps(aVal,
    
                             _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
    
      131068
              q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
    
      131068
              r = _mm_add_epi32(q, _mm_and_si128(q, ones));
    
      196602
              s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
    
      196602
              s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
    
      131068
              s = _mm_div_ps(
    
                  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
    
      65534
              s = _mm_mul_ps(s, s);
    
              // Evaluate Taylor series
    
      524272
              s = _mm_mul_ps(
    
                  _mm_add_ps(
    
                      _mm_mul_ps(
    
                          _mm_sub_ps(
    
                              _mm_mul_ps(
    
                                  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
    
                                             cp3),
    
                                  s),
    
                              cp2),
    
                          s),
    
                      cp1),
    
                  s);
    
        2/2✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.

      262136
              for (i = 0; i < 3; i++) {
    
      393204
                  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
    
              }
    
      65534
              s = _mm_div_ps(s, ftwos);
    
      196602
              sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
    
      65534
              cosine = _mm_sub_ps(fones, s);
    
      262136
              condition1 = _mm_cmpneq_ps(
    
                  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
    
      327670
              condition2 = _mm_cmpneq_ps(
    
                  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
    
                  _mm_cmplt_ps(aVal, fzeroes));
    
      196602
              condition3 = _mm_cmpneq_ps(
    
                  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
    
      65534
              __m128 temp = cosine;
    
      196602
              cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
    
      196602
              sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
    
              sine =
    
      262136
                  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
    
      262136
              cosine = _mm_sub_ps(
    
                  cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
    
      65534
              tangent = _mm_div_ps(sine, cosine);
    
              _mm_storeu_ps(bPtr, tangent);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = tanf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for unaligned */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; number++) {
    
      262142
              *bPtr++ = tanf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #include <volk/volk_neon_intrinsics.h>
    
      static inline void
    
      volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
          unsigned int number = 0;
    
          unsigned int quarter_points = num_points / 4;
    
          float* bVectorPtr = bVector;
    
          const float* aVectorPtr = aVector;
    
          float32x4_t b_vec;
    
          float32x4_t a_vec;
    
          for (number = 0; number < quarter_points; number++) {
    
              a_vec = vld1q_f32(aVectorPtr);
    
              // Prefetch next one, speeds things up
    
              __VOLK_PREFETCH(aVectorPtr + 4);
    
              b_vec = _vtanq_f32(a_vec);
    
              vst1q_f32(bVectorPtr, b_vec);
    
              // move pointers ahead
    
              bVectorPtr += 4;
    
              aVectorPtr += 4;
    
          }
    
          // Deal with the rest
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *bVectorPtr++ = tanf(*aVectorPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #endif /* INCLUDED_volk_32f_tan_32f_u_H */