/* -*- c++ -*- */
/*
 * Copyright 2015-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/* SIMD (SSE2) implementation of exp
   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library
*/

/* Copyright (C) 2007 Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

/*!
 * \page volk_32f_exp_32f
 *
 * \b Overview
 *
 * Computes the exponential of each entry of the input vector and stores the
 * results in the output vector.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32f_exp_32f(float* bVector, const float* aVector, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li aVector: The input vector of floats.
 * \li num_points: The number of data points.
 *
 * \b Outputs
 * \li bVector: The vector where results will be stored.
 *
 * \b Example
 * \code
 *   int N = 10;
 *   unsigned int alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
 *
 *   in[0] = 0;
 *   in[1] = 0.5;
 *   in[2] = std::sqrt(2.f)/2.f;
 *   in[3] = std::sqrt(3.f)/2.f;
 *   in[4] = in[5] = 1;
 *   for(unsigned int ii = 6; ii < N; ++ii){
 *       in[ii] = - in[N-ii-1];
 *   }
 *
 *   volk_32f_exp_32f(out, in, N);
 *
 *   for(unsigned int ii = 0; ii < N; ++ii){
 *       printf("exp(%1.3f) = %1.3f\n", in[ii], out[ii]);
 *   }
 *
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

#ifndef INCLUDED_volk_32f_exp_32f_a_H
#define INCLUDED_volk_32f_exp_32f_a_H

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void
volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // Declare variables and constants
    __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

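    /* Each iteration processes four floats with the cephes-style recipe:
     *   1. clamp x to [exp_lo, exp_hi] so the result stays finite,
     *   2. split x = g + n*ln(2) with n = floor(x*log2(e) + 0.5),
     *   3. approximate exp(g) with a short polynomial in g,
     *   4. rescale by 2^n built directly in the float exponent field.
     */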
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        tmp = _mm_setzero_ps();

        aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);

        /* express exp(x) as exp(g + n*log(2)) */
        fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
        z = _mm_mul_ps(aVal, aVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
        y = _mm_add_ps(y, one);

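        /* 2^n: add the IEEE-754 single-precision bias (127) to n, shift it
           into the exponent field, and reinterpret the bits as a float. */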
149 |
|
✗ |
emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); |
150 |
|
|
|
151 |
|
✗ |
pow2n = _mm_castsi128_ps(emm0); |
152 |
|
✗ |
bVal = _mm_mul_ps(y, pow2n); |
153 |
|
|
|
154 |
|
|
_mm_store_ps(bPtr, bVal); |
155 |
|
✗ |
aPtr += 4; |
156 |
|
✗ |
bPtr += 4; |
157 |
|
|
} |
158 |
|
|
|
159 |
|
✗ |
number = quarterPoints * 4; |
160 |
|
✗ |
for (; number < num_points; number++) { |
161 |
|
✗ |
*bPtr++ = expf(*aPtr++); |
162 |
|
|
} |
163 |
|
✗ |
} |
164 |
|
|
|
165 |
|
|
#endif /* LV_HAVE_SSE2 for aligned */ |
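
/* Editorial note: the scalar sketch below is not part of the VOLK kernel set.
 * It only restates, in plain C, the cephes-style algorithm that the SSE2
 * kernels in this file vectorize four lanes at a time; the helper name
 * exp32f_scalar_sketch is hypothetical and is used nowhere else.
 */
static inline float exp32f_scalar_sketch(float x)
{
    /* Clamp so that the 2^n rescale below stays representable in a float. */
    if (x > 88.3762626647949f)
        x = 88.3762626647949f;
    if (x < -88.3762626647949f)
        x = -88.3762626647949f;

    /* Range reduction: x = g + n*ln(2), with n = floor(x*log2(e) + 0.5). */
    float n = floorf(x * 1.44269504088896341f + 0.5f);
    /* ln(2) is applied as the split constants C1 + C2 to limit rounding error. */
    float g = x - n * 0.693359375f - n * (-2.12194440e-4f);

    /* exp(g) is approximated as 1 + g + g^2 * P(g), with P of degree 5. */
    float z = g * g;
    float y = 1.9875691500e-4f;
    y = y * g + 1.3981999507e-3f;
    y = y * g + 8.3334519073e-3f;
    y = y * g + 4.1665795894e-2f;
    y = y * g + 1.6666665459e-1f;
    y = y * g + 5.0000001201e-1f;
    y = y * z + g + 1.0f;

    /* Rescale by 2^n (the SIMD code builds 2^n directly in the exponent bits). */
    return ldexpf(y, (int)n);
}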


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_exp_32f_a_H */

#ifndef INCLUDED_volk_32f_exp_32f_u_H
#define INCLUDED_volk_32f_exp_32f_u_H

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void
volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // Declare variables and constants
    __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);


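    /* Same cephes-style steps as the aligned kernel above; only the loads and
       stores are unaligned. */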
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        tmp = _mm_setzero_ps();

        aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);

        /* express exp(x) as exp(g + n*log(2)) */
        fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
        z = _mm_mul_ps(aVal, aVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        bVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE2 for unaligned */


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_exp_32f_u_H */