GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_tanh_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	151	153	98.7%
Functions:	8	8	100.0%
Branches:	18	20	90.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_tanh_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes the hyperbolic tangent of each element of the aVector:
    
       *
    
       * c[i] = tanh(a[i])
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The buffer of points.
    
       * \li num_points: The number of values in input buffer.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output buffer.
    
       *
    
       * \b Example
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       // the approximate artanh(x) for x<1
    
       *       float x = (float)ii / (float)N;
    
       *       in[ii] = 0.5 * std::log((1.f+x)/(1.f-x));
    
       *   }
    
       *
    
       *   volk_32f_tanh_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out(%i) = %f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_tanh_32f_a_H
    
      #define INCLUDED_volk_32f_tanh_32f_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <string.h>
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; number++) {
    
      262142
              *cPtr++ = tanhf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      14
      volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      14
          float* cPtr = cVector;
    
      14
          const float* aPtr = aVector;
    
        2/2✓ Branch 0 taken 262210 times.
✓ Branch 1 taken 14 times.

      262224
          for (unsigned int number = 0; number < num_points; number++) {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262210 times.

      262210
              if (*aPtr > 4.97)
    
      ✗
                  *cPtr++ = 1;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262210 times.

      262210
              else if (*aPtr <= -4.97)
    
      ✗
                  *cPtr++ = -1;
    
              else {
    
      262210
                  float x2 = (*aPtr) * (*aPtr);
    
      262210
                  float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
    
      262210
                  float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
    
      262210
                  *cPtr++ = a / b;
    
      262210
                  aPtr++;
    
              }
    
          }
    
      14
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m128 aVal, cVal, x2, a, b;
    
          __m128 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm_set_ps1(135135.0f);
    
      2
          const2 = _mm_set_ps1(17325.0f);
    
      2
          const3 = _mm_set_ps1(378.0f);
    
      2
          const4 = _mm_set_ps1(62370.0f);
    
      2
          const5 = _mm_set_ps1(3150.0f);
    
      2
          const6 = _mm_set_ps1(28.0f);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              x2 = _mm_mul_ps(aVal, aVal);
    
      393204
              a = _mm_mul_ps(
    
                  aVal,
    
                  _mm_add_ps(
    
                      const1,
    
                      _mm_mul_ps(x2,
    
                                 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    
      393204
              b = _mm_add_ps(
    
                  const1,
    
                  _mm_mul_ps(
    
                      x2,
    
                      _mm_add_ps(const4,
    
                                 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
    
      65534
              cVal = _mm_div_ps(a, b);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal, x2, a, b;
    
          __m256 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm256_set1_ps(135135.0f);
    
      2
          const2 = _mm256_set1_ps(17325.0f);
    
      2
          const3 = _mm256_set1_ps(378.0f);
    
      2
          const4 = _mm256_set1_ps(62370.0f);
    
      2
          const5 = _mm256_set1_ps(3150.0f);
    
      2
          const6 = _mm256_set1_ps(28.0f);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              x2 = _mm256_mul_ps(aVal, aVal);
    
      196596
              a = _mm256_mul_ps(
    
                  aVal,
    
                  _mm256_add_ps(
    
                      const1,
    
                      _mm256_mul_ps(
    
                          x2,
    
                          _mm256_add_ps(const2,
    
                                        _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    
      196596
              b = _mm256_add_ps(
    
                  const1,
    
                  _mm256_mul_ps(
    
                      x2,
    
                      _mm256_add_ps(
    
                          const4,
    
                          _mm256_mul_ps(x2,
    
                                        _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
    
      32766
              cVal = _mm256_div_ps(a, b);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal, x2, a, b;
    
          __m256 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm256_set1_ps(135135.0f);
    
      2
          const2 = _mm256_set1_ps(17325.0f);
    
      2
          const3 = _mm256_set1_ps(378.0f);
    
      2
          const4 = _mm256_set1_ps(62370.0f);
    
      2
          const5 = _mm256_set1_ps(3150.0f);
    
      2
          const6 = _mm256_set1_ps(28.0f);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              x2 = _mm256_mul_ps(aVal, aVal);
    
      131064
              a = _mm256_mul_ps(
    
                  aVal,
    
                  _mm256_fmadd_ps(
    
                      x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
    
      98298
              b = _mm256_fmadd_ps(
    
                  x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
    
      32766
              cVal = _mm256_div_ps(a, b);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
    
      #endif /* INCLUDED_volk_32f_tanh_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_tanh_32f_u_H
    
      #define INCLUDED_volk_32f_tanh_32f_u_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <string.h>
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m128 aVal, cVal, x2, a, b;
    
          __m128 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm_set_ps1(135135.0f);
    
      2
          const2 = _mm_set_ps1(17325.0f);
    
      2
          const3 = _mm_set_ps1(378.0f);
    
      2
          const4 = _mm_set_ps1(62370.0f);
    
      2
          const5 = _mm_set_ps1(3150.0f);
    
      2
          const6 = _mm_set_ps1(28.0f);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      65534
              x2 = _mm_mul_ps(aVal, aVal);
    
      393204
              a = _mm_mul_ps(
    
                  aVal,
    
                  _mm_add_ps(
    
                      const1,
    
                      _mm_mul_ps(x2,
    
                                 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    
      393204
              b = _mm_add_ps(
    
                  const1,
    
                  _mm_mul_ps(
    
                      x2,
    
                      _mm_add_ps(const4,
    
                                 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
    
      65534
              cVal = _mm_div_ps(a, b);
    
              _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal, x2, a, b;
    
          __m256 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm256_set1_ps(135135.0f);
    
      2
          const2 = _mm256_set1_ps(17325.0f);
    
      2
          const3 = _mm256_set1_ps(378.0f);
    
      2
          const4 = _mm256_set1_ps(62370.0f);
    
      2
          const5 = _mm256_set1_ps(3150.0f);
    
      2
          const6 = _mm256_set1_ps(28.0f);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              x2 = _mm256_mul_ps(aVal, aVal);
    
      196596
              a = _mm256_mul_ps(
    
                  aVal,
    
                  _mm256_add_ps(
    
                      const1,
    
                      _mm256_mul_ps(
    
                          x2,
    
                          _mm256_add_ps(const2,
    
                                        _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    
      196596
              b = _mm256_add_ps(
    
                  const1,
    
                  _mm256_mul_ps(
    
                      x2,
    
                      _mm256_add_ps(
    
                          const4,
    
                          _mm256_mul_ps(x2,
    
                                        _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
    
      32766
              cVal = _mm256_div_ps(a, b);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal, x2, a, b;
    
          __m256 const1, const2, const3, const4, const5, const6;
    
      2
          const1 = _mm256_set1_ps(135135.0f);
    
      2
          const2 = _mm256_set1_ps(17325.0f);
    
      2
          const3 = _mm256_set1_ps(378.0f);
    
      2
          const4 = _mm256_set1_ps(62370.0f);
    
      2
          const5 = _mm256_set1_ps(3150.0f);
    
      2
          const6 = _mm256_set1_ps(28.0f);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              x2 = _mm256_mul_ps(aVal, aVal);
    
      131064
              a = _mm256_mul_ps(
    
                  aVal,
    
                  _mm256_fmadd_ps(
    
                      x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
    
      98298
              b = _mm256_fmadd_ps(
    
                  x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
    
      32766
              cVal = _mm256_div_ps(a, b);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
    
      #endif /* INCLUDED_volk_32f_tanh_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_tanh_32f
12			*
13			* \b Overview
14			*
15			* Computes the hyperbolic tangent of each element of the aVector:
16			*
17			* c[i] = tanh(a[i])
18			*
19			* <b>Dispatcher Prototype</b>
20			* \code
21			* void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
22			* \endcode
23			*
24			* \b Inputs
25			* \li aVector: The buffer of points.
26			* \li num_points: The number of values in input buffer.
27			*
28			* \b Outputs
29			* \li cVector: The output buffer.
30			*
31			* \b Example
32			* \code
33			* int N = 10;
34			* unsigned int alignment = volk_get_alignment();
35			* float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
36			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
37			*
38			* for(unsigned int ii = 0; ii < N; ++ii){
39			* // the approximate artanh(x) for x<1
40			* float x = (float)ii / (float)N;
41			* in[ii] = 0.5 * std::log((1.f+x)/(1.f-x));
42			* }
43			*
44			* volk_32f_tanh_32f(out, in, N);
45			*
46			* for(unsigned int ii = 0; ii < N; ++ii){
47			* printf("out(%i) = %f\n", ii, out[ii]);
48			* }
49			*
50			* volk_free(in);
51			* volk_free(out);
52			* \endcode
53			*/
54
55			#ifndef INCLUDED_volk_32f_tanh_32f_a_H
56			#define INCLUDED_volk_32f_tanh_32f_a_H
57
58			#include <inttypes.h>
59			#include <math.h>
60			#include <stdio.h>
61			#include <string.h>
62
63
64			#ifdef LV_HAVE_GENERIC
65
66			static inline void
67		2	volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
68			{
69		2	unsigned int number = 0;
70		2	float* cPtr = cVector;
71		2	const float* aPtr = aVector;
72	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; number++) {
73		262142	*cPtr++ = tanhf(*aPtr++);
74			}
75		2	}
76
77			#endif /* LV_HAVE_GENERIC */
78
79
80			#ifdef LV_HAVE_GENERIC
81
82			static inline void
83		14	volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
84			{
85		14	float* cPtr = cVector;
86		14	const float* aPtr = aVector;
87	2/2 ✓ Branch 0 taken 262210 times. ✓ Branch 1 taken 14 times.	262224	for (unsigned int number = 0; number < num_points; number++) {
88	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262210 times.	262210	if (*aPtr > 4.97)
89		✗	*cPtr++ = 1;
90	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262210 times.	262210	else if (*aPtr <= -4.97)
91		✗	*cPtr++ = -1;
92			else {
93		262210	float x2 = (*aPtr) * (*aPtr);
94		262210	float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
95		262210	float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
96		262210	*cPtr++ = a / b;
97		262210	aPtr++;
98			}
99			}
100		14	}
101
102			#endif /* LV_HAVE_GENERIC */
103
104
105			#ifdef LV_HAVE_SSE
106			#include <xmmintrin.h>
107
108			static inline void
109		2	volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
110			{
111		2	unsigned int number = 0;
112		2	const unsigned int quarterPoints = num_points / 4;
113
114		2	float* cPtr = cVector;
115		2	const float* aPtr = aVector;
116
117			__m128 aVal, cVal, x2, a, b;
118			__m128 const1, const2, const3, const4, const5, const6;
119		2	const1 = _mm_set_ps1(135135.0f);
120		2	const2 = _mm_set_ps1(17325.0f);
121		2	const3 = _mm_set_ps1(378.0f);
122		2	const4 = _mm_set_ps1(62370.0f);
123		2	const5 = _mm_set_ps1(3150.0f);
124		2	const6 = _mm_set_ps1(28.0f);
125	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
126
127		65534	aVal = _mm_load_ps(aPtr);
128		65534	x2 = _mm_mul_ps(aVal, aVal);
129		393204	a = _mm_mul_ps(
130			aVal,
131			_mm_add_ps(
132			const1,
133			_mm_mul_ps(x2,
134			_mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
135		393204	b = _mm_add_ps(
136			const1,
137			_mm_mul_ps(
138			x2,
139			_mm_add_ps(const4,
140			_mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
141
142		65534	cVal = _mm_div_ps(a, b);
143
144			_mm_store_ps(cPtr, cVal); // Store the results back into the C container
145
146		65534	aPtr += 4;
147		65534	cPtr += 4;
148			}
149
150		2	number = quarterPoints * 4;
151		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
152		2	}
153			#endif /* LV_HAVE_SSE */
154
155
156			#ifdef LV_HAVE_AVX
157			#include <immintrin.h>
158
159			static inline void
160		2	volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
161			{
162		2	unsigned int number = 0;
163		2	const unsigned int eighthPoints = num_points / 8;
164
165		2	float* cPtr = cVector;
166		2	const float* aPtr = aVector;
167
168			__m256 aVal, cVal, x2, a, b;
169			__m256 const1, const2, const3, const4, const5, const6;
170		2	const1 = _mm256_set1_ps(135135.0f);
171		2	const2 = _mm256_set1_ps(17325.0f);
172		2	const3 = _mm256_set1_ps(378.0f);
173		2	const4 = _mm256_set1_ps(62370.0f);
174		2	const5 = _mm256_set1_ps(3150.0f);
175		2	const6 = _mm256_set1_ps(28.0f);
176	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
177
178		32766	aVal = _mm256_load_ps(aPtr);
179		32766	x2 = _mm256_mul_ps(aVal, aVal);
180		196596	a = _mm256_mul_ps(
181			aVal,
182			_mm256_add_ps(
183			const1,
184			_mm256_mul_ps(
185			x2,
186			_mm256_add_ps(const2,
187			_mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
188		196596	b = _mm256_add_ps(
189			const1,
190			_mm256_mul_ps(
191			x2,
192			_mm256_add_ps(
193			const4,
194			_mm256_mul_ps(x2,
195			_mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
196
197		32766	cVal = _mm256_div_ps(a, b);
198
199			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
200
201		32766	aPtr += 8;
202		32766	cPtr += 8;
203			}
204
205		2	number = eighthPoints * 8;
206		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
207		2	}
208			#endif /* LV_HAVE_AVX */
209
210			#if LV_HAVE_AVX && LV_HAVE_FMA
211			#include <immintrin.h>
212
213			static inline void
214		2	volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
215			{
216		2	unsigned int number = 0;
217		2	const unsigned int eighthPoints = num_points / 8;
218
219		2	float* cPtr = cVector;
220		2	const float* aPtr = aVector;
221
222			__m256 aVal, cVal, x2, a, b;
223			__m256 const1, const2, const3, const4, const5, const6;
224		2	const1 = _mm256_set1_ps(135135.0f);
225		2	const2 = _mm256_set1_ps(17325.0f);
226		2	const3 = _mm256_set1_ps(378.0f);
227		2	const4 = _mm256_set1_ps(62370.0f);
228		2	const5 = _mm256_set1_ps(3150.0f);
229		2	const6 = _mm256_set1_ps(28.0f);
230	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
231
232		32766	aVal = _mm256_load_ps(aPtr);
233		32766	x2 = _mm256_mul_ps(aVal, aVal);
234		131064	a = _mm256_mul_ps(
235			aVal,
236			_mm256_fmadd_ps(
237			x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
238		98298	b = _mm256_fmadd_ps(
239			x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
240
241		32766	cVal = _mm256_div_ps(a, b);
242
243			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
244
245		32766	aPtr += 8;
246		32766	cPtr += 8;
247			}
248
249		2	number = eighthPoints * 8;
250		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
251		2	}
252			#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
253
254			#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
255
256
257			#ifndef INCLUDED_volk_32f_tanh_32f_u_H
258			#define INCLUDED_volk_32f_tanh_32f_u_H
259
260			#include <inttypes.h>
261			#include <math.h>
262			#include <stdio.h>
263			#include <string.h>
264
265
266			#ifdef LV_HAVE_SSE
267			#include <xmmintrin.h>
268
269			static inline void
270		2	volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
271			{
272		2	unsigned int number = 0;
273		2	const unsigned int quarterPoints = num_points / 4;
274
275		2	float* cPtr = cVector;
276		2	const float* aPtr = aVector;
277
278			__m128 aVal, cVal, x2, a, b;
279			__m128 const1, const2, const3, const4, const5, const6;
280		2	const1 = _mm_set_ps1(135135.0f);
281		2	const2 = _mm_set_ps1(17325.0f);
282		2	const3 = _mm_set_ps1(378.0f);
283		2	const4 = _mm_set_ps1(62370.0f);
284		2	const5 = _mm_set_ps1(3150.0f);
285		2	const6 = _mm_set_ps1(28.0f);
286	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
287
288		65534	aVal = _mm_loadu_ps(aPtr);
289		65534	x2 = _mm_mul_ps(aVal, aVal);
290		393204	a = _mm_mul_ps(
291			aVal,
292			_mm_add_ps(
293			const1,
294			_mm_mul_ps(x2,
295			_mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
296		393204	b = _mm_add_ps(
297			const1,
298			_mm_mul_ps(
299			x2,
300			_mm_add_ps(const4,
301			_mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
302
303		65534	cVal = _mm_div_ps(a, b);
304
305			_mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
306
307		65534	aPtr += 4;
308		65534	cPtr += 4;
309			}
310
311		2	number = quarterPoints * 4;
312		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
313		2	}
314			#endif /* LV_HAVE_SSE */
315
316
317			#ifdef LV_HAVE_AVX
318			#include <immintrin.h>
319
320			static inline void
321		2	volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
322			{
323		2	unsigned int number = 0;
324		2	const unsigned int eighthPoints = num_points / 8;
325
326		2	float* cPtr = cVector;
327		2	const float* aPtr = aVector;
328
329			__m256 aVal, cVal, x2, a, b;
330			__m256 const1, const2, const3, const4, const5, const6;
331		2	const1 = _mm256_set1_ps(135135.0f);
332		2	const2 = _mm256_set1_ps(17325.0f);
333		2	const3 = _mm256_set1_ps(378.0f);
334		2	const4 = _mm256_set1_ps(62370.0f);
335		2	const5 = _mm256_set1_ps(3150.0f);
336		2	const6 = _mm256_set1_ps(28.0f);
337	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
338
339		32766	aVal = _mm256_loadu_ps(aPtr);
340		32766	x2 = _mm256_mul_ps(aVal, aVal);
341		196596	a = _mm256_mul_ps(
342			aVal,
343			_mm256_add_ps(
344			const1,
345			_mm256_mul_ps(
346			x2,
347			_mm256_add_ps(const2,
348			_mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
349		196596	b = _mm256_add_ps(
350			const1,
351			_mm256_mul_ps(
352			x2,
353			_mm256_add_ps(
354			const4,
355			_mm256_mul_ps(x2,
356			_mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
357
358		32766	cVal = _mm256_div_ps(a, b);
359
360			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
361
362		32766	aPtr += 8;
363		32766	cPtr += 8;
364			}
365
366		2	number = eighthPoints * 8;
367		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
368		2	}
369			#endif /* LV_HAVE_AVX */
370
371			#if LV_HAVE_AVX && LV_HAVE_FMA
372			#include <immintrin.h>
373
374			static inline void
375		2	volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
376			{
377		2	unsigned int number = 0;
378		2	const unsigned int eighthPoints = num_points / 8;
379
380		2	float* cPtr = cVector;
381		2	const float* aPtr = aVector;
382
383			__m256 aVal, cVal, x2, a, b;
384			__m256 const1, const2, const3, const4, const5, const6;
385		2	const1 = _mm256_set1_ps(135135.0f);
386		2	const2 = _mm256_set1_ps(17325.0f);
387		2	const3 = _mm256_set1_ps(378.0f);
388		2	const4 = _mm256_set1_ps(62370.0f);
389		2	const5 = _mm256_set1_ps(3150.0f);
390		2	const6 = _mm256_set1_ps(28.0f);
391	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
392
393		32766	aVal = _mm256_loadu_ps(aPtr);
394		32766	x2 = _mm256_mul_ps(aVal, aVal);
395		131064	a = _mm256_mul_ps(
396			aVal,
397			_mm256_fmadd_ps(
398			x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
399		98298	b = _mm256_fmadd_ps(
400			x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
401
402		32766	cVal = _mm256_div_ps(a, b);
403
404			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
405
406		32766	aPtr += 8;
407		32766	cPtr += 8;
408			}
409
410		2	number = eighthPoints * 8;
411		2	volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
412		2	}
413			#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
414
415			#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
416