GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_expfast_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	109	109	100.0%
Functions:	7	7	100.0%
Branches:	26	26	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_expfast_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes exp of input vector and stores results in output
    
       * vector. This uses a fast exp approximation with a maximum 7% error.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: Input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li bVector: The output vector.
    
       *
    
       * \b Example
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       in[ii] = std::log((float)ii);
    
       *   }
    
       *
    
       *   volk_32f_expfast_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out(%i) = %f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #define Mln2 0.6931471805f
    
      #define A 8388608.0f
    
      #define B 1065353216.0f
    
      #define C 60801.0f
    
      #ifndef INCLUDED_volk_32f_expfast_32f_a_H
    
      #define INCLUDED_volk_32f_expfast_32f_a_H
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
    
                                                        const float* aVector,
    
                                                        unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
          __m256 aVal, bVal, a, b;
    
          __m256i exp;
    
      2
          a = _mm256_set1_ps(A / Mln2);
    
      2
          b = _mm256_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      65532
              exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
    
      32766
              bVal = _mm256_castsi256_ps(exp);
    
              _mm256_store_ps(bPtr, bVal);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
          __m256 aVal, bVal, a, b;
    
          __m256i exp;
    
      2
          a = _mm256_set1_ps(A / Mln2);
    
      2
          b = _mm256_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      98298
              exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
    
      32766
              bVal = _mm256_castsi256_ps(exp);
    
              _mm256_store_ps(bPtr, bVal);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      2
      static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
    
                                                       const float* aVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128 aVal, bVal, a, b;
    
          __m128i exp;
    
      2
          a = _mm_set1_ps(A / Mln2);
    
      2
          b = _mm_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      196602
              exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
    
      65534
              bVal = _mm_castsi128_ps(exp);
    
              _mm_store_ps(bPtr, bVal);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_expfast_32f_u_H
    
      #define INCLUDED_volk_32f_expfast_32f_u_H
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
    
                                                        const float* aVector,
    
                                                        unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
          __m256 aVal, bVal, a, b;
    
          __m256i exp;
    
      2
          a = _mm256_set1_ps(A / Mln2);
    
      2
          b = _mm256_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      65532
              exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
    
      32766
              bVal = _mm256_castsi256_ps(exp);
    
              _mm256_storeu_ps(bPtr, bVal);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
          __m256 aVal, bVal, a, b;
    
          __m256i exp;
    
      2
          a = _mm256_set1_ps(A / Mln2);
    
      2
          b = _mm256_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      98298
              exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
    
      32766
              bVal = _mm256_castsi256_ps(exp);
    
              _mm256_storeu_ps(bPtr, bVal);
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX for unaligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      2
      static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
    
                                                       const float* aVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128 aVal, bVal, a, b;
    
          __m128i exp;
    
      2
          a = _mm_set1_ps(A / Mln2);
    
      2
          b = _mm_set1_ps(B - C);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      196602
              exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
    
      65534
              bVal = _mm_castsi128_ps(exp);
    
              _mm_storeu_ps(bPtr, bVal);
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for unaligned */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_expfast_32f_generic(float* bVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          float* bPtr = bVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *bPtr++ = expf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_expfast_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_expfast_32f
12			*
13			* \b Overview
14			*
15			* Computes exp of input vector and stores results in output
16			* vector. This uses a fast exp approximation with a maximum 7% error.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
21			* num_points) \endcode
22			*
23			* \b Inputs
24			* \li aVector: Input vector of floats.
25			* \li num_points: The number of data points.
26			*
27			* \b Outputs
28			* \li bVector: The output vector.
29			*
30			* \b Example
31			* \code
32			* int N = 10;
33			* unsigned int alignment = volk_get_alignment();
34			* float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36			*
37			* for(unsigned int ii = 0; ii < N; ++ii){
38			* in[ii] = std::log((float)ii);
39			* }
40			*
41			* volk_32f_expfast_32f(out, in, N);
42			*
43			* for(unsigned int ii = 0; ii < N; ++ii){
44			* printf("out(%i) = %f\n", ii, out[ii]);
45			* }
46			*
47			* volk_free(in);
48			* volk_free(out);
49			* \endcode
50			*/
51
52			#include <inttypes.h>
53			#include <math.h>
54			#include <stdio.h>
55
56			#define Mln2 0.6931471805f
57			#define A 8388608.0f
58			#define B 1065353216.0f
59			#define C 60801.0f
60
61
62			#ifndef INCLUDED_volk_32f_expfast_32f_a_H
63			#define INCLUDED_volk_32f_expfast_32f_a_H
64
65			#if LV_HAVE_AVX && LV_HAVE_FMA
66
67			#include <immintrin.h>
68
69		2	static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
70			const float* aVector,
71			unsigned int num_points)
72			{
73		2	float* bPtr = bVector;
74		2	const float* aPtr = aVector;
75
76		2	unsigned int number = 0;
77		2	const unsigned int eighthPoints = num_points / 8;
78
79			__m256 aVal, bVal, a, b;
80			__m256i exp;
81		2	a = _mm256_set1_ps(A / Mln2);
82		2	b = _mm256_set1_ps(B - C);
83
84	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
85		32766	aVal = _mm256_load_ps(aPtr);
86		65532	exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87		32766	bVal = _mm256_castsi256_ps(exp);
88
89			_mm256_store_ps(bPtr, bVal);
90		32766	aPtr += 8;
91		32766	bPtr += 8;
92			}
93
94		2	number = eighthPoints * 8;
95	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
96		14	*bPtr++ = expf(*aPtr++);
97			}
98		2	}
99
100			#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101
102			#ifdef LV_HAVE_AVX
103
104			#include <immintrin.h>
105
106			static inline void
107		2	volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108			{
109		2	float* bPtr = bVector;
110		2	const float* aPtr = aVector;
111
112		2	unsigned int number = 0;
113		2	const unsigned int eighthPoints = num_points / 8;
114
115			__m256 aVal, bVal, a, b;
116			__m256i exp;
117		2	a = _mm256_set1_ps(A / Mln2);
118		2	b = _mm256_set1_ps(B - C);
119
120	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
121		32766	aVal = _mm256_load_ps(aPtr);
122		98298	exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123		32766	bVal = _mm256_castsi256_ps(exp);
124
125			_mm256_store_ps(bPtr, bVal);
126		32766	aPtr += 8;
127		32766	bPtr += 8;
128			}
129
130		2	number = eighthPoints * 8;
131	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
132		14	*bPtr++ = expf(*aPtr++);
133			}
134		2	}
135
136			#endif /* LV_HAVE_AVX for aligned */
137
138			#ifdef LV_HAVE_SSE4_1
139			#include <smmintrin.h>
140
141		2	static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142			const float* aVector,
143			unsigned int num_points)
144			{
145		2	float* bPtr = bVector;
146		2	const float* aPtr = aVector;
147
148		2	unsigned int number = 0;
149		2	const unsigned int quarterPoints = num_points / 4;
150
151			__m128 aVal, bVal, a, b;
152			__m128i exp;
153		2	a = _mm_set1_ps(A / Mln2);
154		2	b = _mm_set1_ps(B - C);
155
156	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
157		65534	aVal = _mm_load_ps(aPtr);
158		196602	exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159		65534	bVal = _mm_castsi128_ps(exp);
160
161			_mm_store_ps(bPtr, bVal);
162		65534	aPtr += 4;
163		65534	bPtr += 4;
164			}
165
166		2	number = quarterPoints * 4;
167	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
168		6	*bPtr++ = expf(*aPtr++);
169			}
170		2	}
171
172			#endif /* LV_HAVE_SSE4_1 for aligned */
173
174			#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175
176			#ifndef INCLUDED_volk_32f_expfast_32f_u_H
177			#define INCLUDED_volk_32f_expfast_32f_u_H
178
179			#if LV_HAVE_AVX && LV_HAVE_FMA
180			#include <immintrin.h>
181
182		2	static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183			const float* aVector,
184			unsigned int num_points)
185			{
186		2	float* bPtr = bVector;
187		2	const float* aPtr = aVector;
188
189		2	unsigned int number = 0;
190		2	const unsigned int eighthPoints = num_points / 8;
191
192			__m256 aVal, bVal, a, b;
193			__m256i exp;
194		2	a = _mm256_set1_ps(A / Mln2);
195		2	b = _mm256_set1_ps(B - C);
196
197	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
198		32766	aVal = _mm256_loadu_ps(aPtr);
199		65532	exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200		32766	bVal = _mm256_castsi256_ps(exp);
201
202			_mm256_storeu_ps(bPtr, bVal);
203		32766	aPtr += 8;
204		32766	bPtr += 8;
205			}
206
207		2	number = eighthPoints * 8;
208	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
209		14	*bPtr++ = expf(*aPtr++);
210			}
211		2	}
212
213			#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214
215			#ifdef LV_HAVE_AVX
216			#include <immintrin.h>
217
218			static inline void
219		2	volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
220			{
221		2	float* bPtr = bVector;
222		2	const float* aPtr = aVector;
223
224		2	unsigned int number = 0;
225		2	const unsigned int eighthPoints = num_points / 8;
226
227			__m256 aVal, bVal, a, b;
228			__m256i exp;
229		2	a = _mm256_set1_ps(A / Mln2);
230		2	b = _mm256_set1_ps(B - C);
231
232	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
233		32766	aVal = _mm256_loadu_ps(aPtr);
234		98298	exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235		32766	bVal = _mm256_castsi256_ps(exp);
236
237			_mm256_storeu_ps(bPtr, bVal);
238		32766	aPtr += 8;
239		32766	bPtr += 8;
240			}
241
242		2	number = eighthPoints * 8;
243	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
244		14	*bPtr++ = expf(*aPtr++);
245			}
246		2	}
247
248			#endif /* LV_HAVE_AVX for unaligned */
249
250
251			#ifdef LV_HAVE_SSE4_1
252			#include <smmintrin.h>
253
254		2	static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255			const float* aVector,
256			unsigned int num_points)
257			{
258		2	float* bPtr = bVector;
259		2	const float* aPtr = aVector;
260
261		2	unsigned int number = 0;
262		2	const unsigned int quarterPoints = num_points / 4;
263
264			__m128 aVal, bVal, a, b;
265			__m128i exp;
266		2	a = _mm_set1_ps(A / Mln2);
267		2	b = _mm_set1_ps(B - C);
268
269	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
270		65534	aVal = _mm_loadu_ps(aPtr);
271		196602	exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272		65534	bVal = _mm_castsi128_ps(exp);
273
274			_mm_storeu_ps(bPtr, bVal);
275		65534	aPtr += 4;
276		65534	bPtr += 4;
277			}
278
279		2	number = quarterPoints * 4;
280	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
281		6	*bPtr++ = expf(*aPtr++);
282			}
283		2	}
284
285			#endif /* LV_HAVE_SSE4_1 for unaligned */
286
287
288			#ifdef LV_HAVE_GENERIC
289
290		2	static inline void volk_32f_expfast_32f_generic(float* bVector,
291			const float* aVector,
292			unsigned int num_points)
293			{
294		2	float* bPtr = bVector;
295		2	const float* aPtr = aVector;
296		2	unsigned int number = 0;
297
298	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
299		262142	*bPtr++ = expf(*aPtr++);
300			}
301		2	}
302			#endif /* LV_HAVE_GENERIC */
303
304			#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
305