GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_expfast_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 109 109 100.0%
Functions: 7 7 100.0%
Branches: 26 26 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_expfast_32f
12 *
13 * \b Overview
14 *
15 * Computes exp of input vector and stores results in output
16 * vector. This uses a fast exp approximation with a maximum 7% error.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
21 * num_points) \endcode
22 *
23 * \b Inputs
24 * \li aVector: Input vector of floats.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li bVector: The output vector.
29 *
30 * \b Example
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * for(unsigned int ii = 0; ii < N; ++ii){
38 * in[ii] = std::log((float)ii);
39 * }
40 *
41 * volk_32f_expfast_32f(out, in, N);
42 *
43 * for(unsigned int ii = 0; ii < N; ++ii){
44 * printf("out(%i) = %f\n", ii, out[ii]);
45 * }
46 *
47 * volk_free(in);
48 * volk_free(out);
49 * \endcode
50 */
51
52 #include <inttypes.h>
53 #include <math.h>
54 #include <stdio.h>
55
/*
 * Constants for the bit-level exp approximation used by the SIMD kernels in
 * this file. Those kernels evaluate (A / Mln2) * x + (B - C), convert the
 * result to a 32-bit integer and reinterpret the integer bits as a float.
 *
 *   Mln2 - ln(2) to float precision.
 *   A    - 2^23 (8388608).
 *   B    - 127 * 2^23 (1065353216 == 0x3F800000, the bit pattern of 1.0f).
 *   C    - offset subtracted from B; presumably the error-tuning term of
 *          Schraudolph's fast-exp method -- TODO confirm.
 *
 * NOTE(review): these short macro names are defined ahead of the include
 * guards and no #undef is visible in this listing -- check for name clashes
 * in translation units that include this header.
 */
56 #define Mln2 0.6931471805f
57 #define A 8388608.0f
58 #define B 1065353216.0f
59 #define C 60801.0f
60
61
62 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
63 #define INCLUDED_volk_32f_expfast_32f_a_H
64
65 #if LV_HAVE_AVX && LV_HAVE_FMA
66
67 #include <immintrin.h>
68
/*!
 * Aligned AVX + FMA kernel: approximates exp() of aVector into bVector,
 * eight floats per iteration.
 *
 * \param bVector    Output buffer; written with _mm256_store_ps, so it is
 *                   expected to be 32-byte aligned.
 * \param aVector    Input buffer; read with _mm256_load_ps, so it is
 *                   expected to be 32-byte aligned.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 8) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
69 2 static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
70 const float* aVector,
71 unsigned int num_points)
72 {
73 2 float* bPtr = bVector;
74 2 const float* aPtr = aVector;
75
76 2 unsigned int number = 0;
77 2 const unsigned int eighthPoints = num_points / 8;
78
79 __m256 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
80 __m256i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all eight lanes. */
81 2 a = _mm256_set1_ps(A / Mln2);
82 2 b = _mm256_set1_ps(B - C);
83
/* Vector body: one fused multiply-add (a * x + b), convert to int32, then
 * reinterpret the integer bits as floats (a cast, not a numeric conversion). */
84
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
85 32766 aVal = _mm256_load_ps(aPtr);
86 65532 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87 32766 bVal = _mm256_castsi256_ps(exp);
88
89 _mm256_store_ps(bPtr, bVal);
90 32766 aPtr += 8;
91 32766 bPtr += 8;
92 }
93
/* Resume at the first element not covered by the vector body. */
94 2 number = eighthPoints * 8;
95
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
96 14 *bPtr++ = expf(*aPtr++);
97 }
98 2 }
99
100 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101
102 #ifdef LV_HAVE_AVX
103
104 #include <immintrin.h>
105
/*!
 * Aligned AVX kernel (no FMA): approximates exp() of aVector into bVector,
 * eight floats per iteration, using a separate multiply and add.
 *
 * \param bVector    Output buffer; written with _mm256_store_ps, so it is
 *                   expected to be 32-byte aligned.
 * \param aVector    Input buffer; read with _mm256_load_ps, so it is
 *                   expected to be 32-byte aligned.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 8) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
106 static inline void
107 2 volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108 {
109 2 float* bPtr = bVector;
110 2 const float* aPtr = aVector;
111
112 2 unsigned int number = 0;
113 2 const unsigned int eighthPoints = num_points / 8;
114
115 __m256 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
116 __m256i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all eight lanes. */
117 2 a = _mm256_set1_ps(A / Mln2);
118 2 b = _mm256_set1_ps(B - C);
119
/* Vector body: a * x + b as multiply then add, convert to int32, then
 * reinterpret the integer bits as floats. */
120
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
121 32766 aVal = _mm256_load_ps(aPtr);
122 98298 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123 32766 bVal = _mm256_castsi256_ps(exp);
124
125 _mm256_store_ps(bPtr, bVal);
126 32766 aPtr += 8;
127 32766 bPtr += 8;
128 }
129
/* Resume at the first element not covered by the vector body. */
130 2 number = eighthPoints * 8;
131
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
132 14 *bPtr++ = expf(*aPtr++);
133 }
134 2 }
135
136 #endif /* LV_HAVE_AVX for aligned */
137
138 #ifdef LV_HAVE_SSE4_1
139 #include <smmintrin.h>
140
/*!
 * Aligned SSE kernel: approximates exp() of aVector into bVector, four
 * floats per iteration.
 *
 * \param bVector    Output buffer; written with _mm_store_ps, so it is
 *                   expected to be 16-byte aligned.
 * \param aVector    Input buffer; read with _mm_load_ps, so it is expected
 *                   to be 16-byte aligned.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 4) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
141 2 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142 const float* aVector,
143 unsigned int num_points)
144 {
145 2 float* bPtr = bVector;
146 2 const float* aPtr = aVector;
147
148 2 unsigned int number = 0;
149 2 const unsigned int quarterPoints = num_points / 4;
150
151 __m128 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
152 __m128i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all four lanes. */
153 2 a = _mm_set1_ps(A / Mln2);
154 2 b = _mm_set1_ps(B - C);
155
/* Vector body: a * x + b as multiply then add, convert to int32, then
 * reinterpret the integer bits as floats. */
156
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
157 65534 aVal = _mm_load_ps(aPtr);
158 196602 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159 65534 bVal = _mm_castsi128_ps(exp);
160
161 _mm_store_ps(bPtr, bVal);
162 65534 aPtr += 4;
163 65534 bPtr += 4;
164 }
165
/* Resume at the first element not covered by the vector body. */
166 2 number = quarterPoints * 4;
167
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
168 6 *bPtr++ = expf(*aPtr++);
169 }
170 2 }
171
172 #endif /* LV_HAVE_SSE4_1 for aligned */
173
174 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175
176 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
177 #define INCLUDED_volk_32f_expfast_32f_u_H
178
179 #if LV_HAVE_AVX && LV_HAVE_FMA
180 #include <immintrin.h>
181
/*!
 * Unaligned AVX + FMA kernel: approximates exp() of aVector into bVector,
 * eight floats per iteration.
 *
 * \param bVector    Output buffer; written with _mm256_storeu_ps (the
 *                   unaligned store).
 * \param aVector    Input buffer; read with _mm256_loadu_ps (the unaligned
 *                   load).
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 8) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
182 2 static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183 const float* aVector,
184 unsigned int num_points)
185 {
186 2 float* bPtr = bVector;
187 2 const float* aPtr = aVector;
188
189 2 unsigned int number = 0;
190 2 const unsigned int eighthPoints = num_points / 8;
191
192 __m256 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
193 __m256i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all eight lanes. */
194 2 a = _mm256_set1_ps(A / Mln2);
195 2 b = _mm256_set1_ps(B - C);
196
/* Vector body: one fused multiply-add (a * x + b), convert to int32, then
 * reinterpret the integer bits as floats (a cast, not a numeric conversion). */
197
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
198 32766 aVal = _mm256_loadu_ps(aPtr);
199 65532 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200 32766 bVal = _mm256_castsi256_ps(exp);
201
202 _mm256_storeu_ps(bPtr, bVal);
203 32766 aPtr += 8;
204 32766 bPtr += 8;
205 }
206
/* Resume at the first element not covered by the vector body. */
207 2 number = eighthPoints * 8;
208
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
209 14 *bPtr++ = expf(*aPtr++);
210 }
211 2 }
212
213 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214
215 #ifdef LV_HAVE_AVX
216 #include <immintrin.h>
217
/*!
 * Unaligned AVX kernel (no FMA): approximates exp() of aVector into
 * bVector, eight floats per iteration, using a separate multiply and add.
 *
 * \param bVector    Output buffer; written with _mm256_storeu_ps (the
 *                   unaligned store).
 * \param aVector    Input buffer; read with _mm256_loadu_ps (the unaligned
 *                   load).
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 8) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
218 static inline void
219 2 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
220 {
221 2 float* bPtr = bVector;
222 2 const float* aPtr = aVector;
223
224 2 unsigned int number = 0;
225 2 const unsigned int eighthPoints = num_points / 8;
226
227 __m256 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
228 __m256i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all eight lanes. */
229 2 a = _mm256_set1_ps(A / Mln2);
230 2 b = _mm256_set1_ps(B - C);
231
/* Vector body: a * x + b as multiply then add, convert to int32, then
 * reinterpret the integer bits as floats. */
232
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
233 32766 aVal = _mm256_loadu_ps(aPtr);
234 98298 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235 32766 bVal = _mm256_castsi256_ps(exp);
236
237 _mm256_storeu_ps(bPtr, bVal);
238 32766 aPtr += 8;
239 32766 bPtr += 8;
240 }
241
/* Resume at the first element not covered by the vector body. */
242 2 number = eighthPoints * 8;
243
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
244 14 *bPtr++ = expf(*aPtr++);
245 }
246 2 }
247
248 #endif /* LV_HAVE_AVX for unaligned */
249
250
251 #ifdef LV_HAVE_SSE4_1
252 #include <smmintrin.h>
253
/*!
 * Unaligned SSE kernel: approximates exp() of aVector into bVector, four
 * floats per iteration.
 *
 * \param bVector    Output buffer; written with _mm_storeu_ps (the
 *                   unaligned store).
 * \param aVector    Input buffer; read with _mm_loadu_ps (the unaligned
 *                   load).
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the leftover (num_points % 4) elements are computed with
 * expf(), not with the approximation, so their accuracy differs from the
 * vectorised elements.
 */
254 2 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255 const float* aVector,
256 unsigned int num_points)
257 {
258 2 float* bPtr = bVector;
259 2 const float* aPtr = aVector;
260
261 2 unsigned int number = 0;
262 2 const unsigned int quarterPoints = num_points / 4;
263
264 __m128 aVal, bVal, a, b;
/* NOTE(review): this local shares its name with exp() from <math.h>. */
265 __m128i exp;
/* Broadcast the scale (A / Mln2) and the bias (B - C) to all four lanes. */
266 2 a = _mm_set1_ps(A / Mln2);
267 2 b = _mm_set1_ps(B - C);
268
/* Vector body: a * x + b as multiply then add, convert to int32, then
 * reinterpret the integer bits as floats. */
269
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
270 65534 aVal = _mm_loadu_ps(aPtr);
271 196602 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272 65534 bVal = _mm_castsi128_ps(exp);
273
274 _mm_storeu_ps(bPtr, bVal);
275 65534 aPtr += 4;
276 65534 bPtr += 4;
277 }
278
/* Resume at the first element not covered by the vector body. */
279 2 number = quarterPoints * 4;
280
/* Scalar tail: remaining elements go through the libm expf(). */
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
281 6 *bPtr++ = expf(*aPtr++);
282 }
283 2 }
284
285 #endif /* LV_HAVE_SSE4_1 for unaligned */
286
287
288 #ifdef LV_HAVE_GENERIC
289
/*!
 * Generic (portable) kernel: writes expf(aVector[i]) to bVector[i] for every
 * element.
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer, num_points floats.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): this path calls the libm expf() for every element and does
 * not apply the bit-level approximation used by the SIMD kernels, so its
 * results are not bit-identical to theirs.
 */
290 2 static inline void volk_32f_expfast_32f_generic(float* bVector,
291 const float* aVector,
292 unsigned int num_points)
293 {
294 2 float* bPtr = bVector;
295 2 const float* aPtr = aVector;
/* number is set to 0 here and again in the for-initialiser below. */
296 2 unsigned int number = 0;
297
298
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
299 262142 *bPtr++ = expf(*aPtr++);
300 }
301 2 }
302 #endif /* LV_HAVE_GENERIC */
303
304 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */
305