Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_expfast_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes exp of input vector and stores results in output | ||
16 | * vector. This uses a fast exp approximation with a maximum 7% error. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int | ||
21 | * num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: Input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li bVector: The output vector. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | * int N = 10; | ||
33 | * unsigned int alignment = volk_get_alignment(); | ||
34 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * | ||
37 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
38 | * in[ii] = std::log((float)ii); | ||
39 | * } | ||
40 | * | ||
41 | * volk_32f_expfast_32f(out, in, N); | ||
42 | * | ||
43 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
44 | * printf("out(%u) = %f\n", ii, out[ii]); | ||
45 | * } | ||
46 | * | ||
47 | * volk_free(in); | ||
48 | * volk_free(out); | ||
49 | * \endcode | ||
50 | */ | ||
51 | |||
52 | #include <inttypes.h> | ||
53 | #include <math.h> | ||
54 | #include <stdio.h> | ||
55 | |||
56 | #define Mln2 0.6931471805f | ||
57 | #define A 8388608.0f | ||
58 | #define B 1065353216.0f | ||
59 | #define C 60801.0f | ||
60 | |||
61 | |||
62 | #ifndef INCLUDED_volk_32f_expfast_32f_a_H | ||
63 | #define INCLUDED_volk_32f_expfast_32f_a_H | ||
64 | |||
65 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
66 | |||
67 | #include <immintrin.h> | ||
68 | |||
69 | 2 | static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector, | |
70 | const float* aVector, | ||
71 | unsigned int num_points) | ||
72 | { | ||
73 | 2 | float* bPtr = bVector; | |
74 | 2 | const float* aPtr = aVector; | |
75 | |||
76 | 2 | unsigned int number = 0; | |
77 | 2 | const unsigned int eighthPoints = num_points / 8; | |
78 | |||
79 | __m256 aVal, bVal, a, b; | ||
80 | __m256i exp; | ||
81 | 2 | a = _mm256_set1_ps(A / Mln2); | |
82 | 2 | b = _mm256_set1_ps(B - C); | |
83 | |||
84 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
85 | 32766 | aVal = _mm256_load_ps(aPtr); | |
86 | 65532 | exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); | |
87 | 32766 | bVal = _mm256_castsi256_ps(exp); | |
88 | |||
89 | _mm256_store_ps(bPtr, bVal); | ||
90 | 32766 | aPtr += 8; | |
91 | 32766 | bPtr += 8; | |
92 | } | ||
93 | |||
94 | 2 | number = eighthPoints * 8; | |
95 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
96 | 14 | *bPtr++ = expf(*aPtr++); | |
97 | } | ||
98 | 2 | } | |
99 | |||
100 | #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */ | ||
101 | |||
102 | #ifdef LV_HAVE_AVX | ||
103 | |||
104 | #include <immintrin.h> | ||
105 | |||
106 | static inline void | ||
107 | 2 | volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) | |
108 | { | ||
109 | 2 | float* bPtr = bVector; | |
110 | 2 | const float* aPtr = aVector; | |
111 | |||
112 | 2 | unsigned int number = 0; | |
113 | 2 | const unsigned int eighthPoints = num_points / 8; | |
114 | |||
115 | __m256 aVal, bVal, a, b; | ||
116 | __m256i exp; | ||
117 | 2 | a = _mm256_set1_ps(A / Mln2); | |
118 | 2 | b = _mm256_set1_ps(B - C); | |
119 | |||
120 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
121 | 32766 | aVal = _mm256_load_ps(aPtr); | |
122 | 98298 | exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); | |
123 | 32766 | bVal = _mm256_castsi256_ps(exp); | |
124 | |||
125 | _mm256_store_ps(bPtr, bVal); | ||
126 | 32766 | aPtr += 8; | |
127 | 32766 | bPtr += 8; | |
128 | } | ||
129 | |||
130 | 2 | number = eighthPoints * 8; | |
131 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
132 | 14 | *bPtr++ = expf(*aPtr++); | |
133 | } | ||
134 | 2 | } | |
135 | |||
136 | #endif /* LV_HAVE_AVX for aligned */ | ||
137 | |||
138 | #ifdef LV_HAVE_SSE4_1 | ||
139 | #include <smmintrin.h> | ||
140 | |||
141 | 2 | static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, | |
142 | const float* aVector, | ||
143 | unsigned int num_points) | ||
144 | { | ||
145 | 2 | float* bPtr = bVector; | |
146 | 2 | const float* aPtr = aVector; | |
147 | |||
148 | 2 | unsigned int number = 0; | |
149 | 2 | const unsigned int quarterPoints = num_points / 4; | |
150 | |||
151 | __m128 aVal, bVal, a, b; | ||
152 | __m128i exp; | ||
153 | 2 | a = _mm_set1_ps(A / Mln2); | |
154 | 2 | b = _mm_set1_ps(B - C); | |
155 | |||
156 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
157 | 65534 | aVal = _mm_load_ps(aPtr); | |
158 | 196602 | exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); | |
159 | 65534 | bVal = _mm_castsi128_ps(exp); | |
160 | |||
161 | _mm_store_ps(bPtr, bVal); | ||
162 | 65534 | aPtr += 4; | |
163 | 65534 | bPtr += 4; | |
164 | } | ||
165 | |||
166 | 2 | number = quarterPoints * 4; | |
167 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
168 | 6 | *bPtr++ = expf(*aPtr++); | |
169 | } | ||
170 | 2 | } | |
171 | |||
172 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
173 | |||
174 | #endif /* INCLUDED_volk_32f_expfast_32f_a_H */ | ||
175 | |||
176 | #ifndef INCLUDED_volk_32f_expfast_32f_u_H | ||
177 | #define INCLUDED_volk_32f_expfast_32f_u_H | ||
178 | |||
179 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
180 | #include <immintrin.h> | ||
181 | |||
182 | 2 | static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector, | |
183 | const float* aVector, | ||
184 | unsigned int num_points) | ||
185 | { | ||
186 | 2 | float* bPtr = bVector; | |
187 | 2 | const float* aPtr = aVector; | |
188 | |||
189 | 2 | unsigned int number = 0; | |
190 | 2 | const unsigned int eighthPoints = num_points / 8; | |
191 | |||
192 | __m256 aVal, bVal, a, b; | ||
193 | __m256i exp; | ||
194 | 2 | a = _mm256_set1_ps(A / Mln2); | |
195 | 2 | b = _mm256_set1_ps(B - C); | |
196 | |||
197 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
198 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
199 | 65532 | exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); | |
200 | 32766 | bVal = _mm256_castsi256_ps(exp); | |
201 | |||
202 | _mm256_storeu_ps(bPtr, bVal); | ||
203 | 32766 | aPtr += 8; | |
204 | 32766 | bPtr += 8; | |
205 | } | ||
206 | |||
207 | 2 | number = eighthPoints * 8; | |
208 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
209 | 14 | *bPtr++ = expf(*aPtr++); | |
210 | } | ||
211 | 2 | } | |
212 | |||
213 | #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */ | ||
214 | |||
215 | #ifdef LV_HAVE_AVX | ||
216 | #include <immintrin.h> | ||
217 | |||
218 | static inline void | ||
219 | 2 | volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) | |
220 | { | ||
221 | 2 | float* bPtr = bVector; | |
222 | 2 | const float* aPtr = aVector; | |
223 | |||
224 | 2 | unsigned int number = 0; | |
225 | 2 | const unsigned int eighthPoints = num_points / 8; | |
226 | |||
227 | __m256 aVal, bVal, a, b; | ||
228 | __m256i exp; | ||
229 | 2 | a = _mm256_set1_ps(A / Mln2); | |
230 | 2 | b = _mm256_set1_ps(B - C); | |
231 | |||
232 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
233 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
234 | 98298 | exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); | |
235 | 32766 | bVal = _mm256_castsi256_ps(exp); | |
236 | |||
237 | _mm256_storeu_ps(bPtr, bVal); | ||
238 | 32766 | aPtr += 8; | |
239 | 32766 | bPtr += 8; | |
240 | } | ||
241 | |||
242 | 2 | number = eighthPoints * 8; | |
243 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
244 | 14 | *bPtr++ = expf(*aPtr++); | |
245 | } | ||
246 | 2 | } | |
247 | |||
248 | #endif /* LV_HAVE_AVX for unaligned */ | ||
249 | |||
250 | |||
251 | #ifdef LV_HAVE_SSE4_1 | ||
252 | #include <smmintrin.h> | ||
253 | |||
254 | 2 | static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, | |
255 | const float* aVector, | ||
256 | unsigned int num_points) | ||
257 | { | ||
258 | 2 | float* bPtr = bVector; | |
259 | 2 | const float* aPtr = aVector; | |
260 | |||
261 | 2 | unsigned int number = 0; | |
262 | 2 | const unsigned int quarterPoints = num_points / 4; | |
263 | |||
264 | __m128 aVal, bVal, a, b; | ||
265 | __m128i exp; | ||
266 | 2 | a = _mm_set1_ps(A / Mln2); | |
267 | 2 | b = _mm_set1_ps(B - C); | |
268 | |||
269 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
270 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
271 | 196602 | exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); | |
272 | 65534 | bVal = _mm_castsi128_ps(exp); | |
273 | |||
274 | _mm_storeu_ps(bPtr, bVal); | ||
275 | 65534 | aPtr += 4; | |
276 | 65534 | bPtr += 4; | |
277 | } | ||
278 | |||
279 | 2 | number = quarterPoints * 4; | |
280 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
281 | 6 | *bPtr++ = expf(*aPtr++); | |
282 | } | ||
283 | 2 | } | |
284 | |||
285 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
286 | |||
287 | |||
288 | #ifdef LV_HAVE_GENERIC | ||
289 | |||
290 | 2 | static inline void volk_32f_expfast_32f_generic(float* bVector, | |
291 | const float* aVector, | ||
292 | unsigned int num_points) | ||
293 | { | ||
294 | 2 | float* bPtr = bVector; | |
295 | 2 | const float* aPtr = aVector; | |
296 | 2 | unsigned int number = 0; | |
297 | |||
298 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
299 | 262142 | *bPtr++ = expf(*aPtr++); | |
300 | } | ||
301 | 2 | } | |
302 | #endif /* LV_HAVE_GENERIC */ | ||
303 | |||
304 | #endif /* INCLUDED_volk_32f_expfast_32f_u_H */ | ||
305 |