Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_tanh_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the hyperbolic tangent of each element of the aVector: | ||
16 | * | ||
17 | * c[i] = tanh(a[i]) | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points) | ||
22 | * \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li aVector: The buffer of points. | ||
26 | * \li num_points: The number of values in input buffer. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li cVector: The output buffer. | ||
30 | * | ||
31 | * \b Example | ||
32 | * \code | ||
33 | * int N = 10; | ||
34 | * unsigned int alignment = volk_get_alignment(); | ||
35 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | * | ||
38 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
39 | * // the approximate artanh(x) for x<1 | ||
40 | * float x = (float)ii / (float)N; | ||
41 | * in[ii] = 0.5 * std::log((1.f+x)/(1.f-x)); | ||
42 | * } | ||
43 | * | ||
44 | * volk_32f_tanh_32f(out, in, N); | ||
45 | * | ||
46 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
47 | * printf("out(%i) = %f\n", ii, out[ii]); | ||
48 | * } | ||
49 | * | ||
50 | * volk_free(in); | ||
51 | * volk_free(out); | ||
52 | * \endcode | ||
53 | */ | ||
54 | |||
55 | #ifndef INCLUDED_volk_32f_tanh_32f_a_H | ||
56 | #define INCLUDED_volk_32f_tanh_32f_a_H | ||
57 | |||
58 | #include <inttypes.h> | ||
59 | #include <math.h> | ||
60 | #include <stdio.h> | ||
61 | #include <string.h> | ||
62 | |||
63 | |||
64 | #ifdef LV_HAVE_GENERIC | ||
65 | |||
66 | static inline void | ||
67 | 2 | volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points) | |
68 | { | ||
69 | 2 | unsigned int number = 0; | |
70 | 2 | float* cPtr = cVector; | |
71 | 2 | const float* aPtr = aVector; | |
72 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
73 | 262142 | *cPtr++ = tanhf(*aPtr++); | |
74 | } | ||
75 | 2 | } | |
76 | |||
77 | #endif /* LV_HAVE_GENERIC */ | ||
78 | |||
79 | |||
80 | #ifdef LV_HAVE_GENERIC | ||
81 | |||
/*!
 * \brief Approximates tanh with a rational polynomial, saturating for large inputs.
 *
 * For each sample x the result is
 *
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   ----------------------------------------
 *   135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * except that inputs above 4.97 yield exactly 1 and inputs at or below
 * -4.97 yield exactly -1.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;

    for (unsigned int number = 0; number < num_points; number++) {
        /* Consume the input exactly once per iteration. Advancing it only in
         * the polynomial branch would leave the pointer stuck on the first
         * saturated sample and corrupt every output after it. */
        const float x = *aPtr++;

        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            const float x2 = x * x;
            const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
101 | |||
102 | #endif /* LV_HAVE_GENERIC */ | ||
103 | |||
104 | |||
105 | #ifdef LV_HAVE_SSE | ||
106 | #include <xmmintrin.h> | ||
107 | |||
/*!
 * \brief SSE kernel using aligned loads/stores (_mm_load_ps / _mm_store_ps).
 *
 * Processes four samples per iteration with the rational approximation
 *   x * (135135 + x2 * (17325 + x2 * (378 + x2)))
 *   / (135135 + x2 * (62370 + x2 * (3150 + x2 * 28))),  x2 = x * x,
 * then hands the remaining (num_points % 4) samples to
 * volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 4;

    const float* in = aVector;
    float* out = cVector;

    const __m128 k135135 = _mm_set_ps1(135135.0f);
    const __m128 k17325 = _mm_set_ps1(17325.0f);
    const __m128 k378 = _mm_set_ps1(378.0f);
    const __m128 k62370 = _mm_set_ps1(62370.0f);
    const __m128 k3150 = _mm_set_ps1(3150.0f);
    const __m128 k28 = _mm_set_ps1(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m128 x = _mm_load_ps(in);
        const __m128 xx = _mm_mul_ps(x, x);

        /* Numerator, evaluated innermost term first. */
        __m128 num = _mm_add_ps(k378, xx);
        num = _mm_mul_ps(xx, num);
        num = _mm_add_ps(k17325, num);
        num = _mm_mul_ps(xx, num);
        num = _mm_add_ps(k135135, num);
        num = _mm_mul_ps(x, num);

        /* Denominator, evaluated innermost term first. */
        __m128 den = _mm_mul_ps(xx, k28);
        den = _mm_add_ps(k3150, den);
        den = _mm_mul_ps(xx, den);
        den = _mm_add_ps(k62370, den);
        den = _mm_mul_ps(xx, den);
        den = _mm_add_ps(k135135, den);

        _mm_store_ps(out, _mm_div_ps(num, den));

        in += 4;
        out += 4;
    }

    /* Scalar tail: whatever did not fill a complete 4-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 4);
}
153 | #endif /* LV_HAVE_SSE */ | ||
154 | |||
155 | |||
156 | #ifdef LV_HAVE_AVX | ||
157 | #include <immintrin.h> | ||
158 | |||
/*!
 * \brief AVX kernel using aligned loads/stores (_mm256_load_ps / _mm256_store_ps).
 *
 * Processes eight samples per iteration with the rational approximation
 *   x * (135135 + x2 * (17325 + x2 * (378 + x2)))
 *   / (135135 + x2 * (62370 + x2 * (3150 + x2 * 28))),  x2 = x * x,
 * then hands the remaining (num_points % 8) samples to
 * volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    const __m256 k135135 = _mm256_set1_ps(135135.0f);
    const __m256 k17325 = _mm256_set1_ps(17325.0f);
    const __m256 k378 = _mm256_set1_ps(378.0f);
    const __m256 k62370 = _mm256_set1_ps(62370.0f);
    const __m256 k3150 = _mm256_set1_ps(3150.0f);
    const __m256 k28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m256 x = _mm256_load_ps(in);
        const __m256 xx = _mm256_mul_ps(x, x);

        /* Numerator, evaluated innermost term first. */
        __m256 num = _mm256_add_ps(k378, xx);
        num = _mm256_mul_ps(xx, num);
        num = _mm256_add_ps(k17325, num);
        num = _mm256_mul_ps(xx, num);
        num = _mm256_add_ps(k135135, num);
        num = _mm256_mul_ps(x, num);

        /* Denominator, evaluated innermost term first. */
        __m256 den = _mm256_mul_ps(xx, k28);
        den = _mm256_add_ps(k3150, den);
        den = _mm256_mul_ps(xx, den);
        den = _mm256_add_ps(k62370, den);
        den = _mm256_mul_ps(xx, den);
        den = _mm256_add_ps(k135135, den);

        _mm256_store_ps(out, _mm256_div_ps(num, den));

        in += 8;
        out += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 8);
}
208 | #endif /* LV_HAVE_AVX */ | ||
209 | |||
210 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
211 | #include <immintrin.h> | ||
212 | |||
/*!
 * \brief AVX+FMA kernel using aligned loads/stores.
 *
 * Same rational approximation as the plain AVX kernel, but each
 * "x2 * inner + constant" step is a single _mm256_fmadd_ps. Eight samples
 * are processed per iteration; the remaining (num_points % 8) samples are
 * handed to volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    const __m256 k135135 = _mm256_set1_ps(135135.0f);
    const __m256 k17325 = _mm256_set1_ps(17325.0f);
    const __m256 k378 = _mm256_set1_ps(378.0f);
    const __m256 k62370 = _mm256_set1_ps(62370.0f);
    const __m256 k3150 = _mm256_set1_ps(3150.0f);
    const __m256 k28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m256 x = _mm256_load_ps(in);
        const __m256 xx = _mm256_mul_ps(x, x);

        /* Numerator: x * (135135 + xx * (17325 + xx * (378 + xx))). */
        __m256 num = _mm256_add_ps(k378, xx);
        num = _mm256_fmadd_ps(xx, num, k17325);
        num = _mm256_fmadd_ps(xx, num, k135135);
        num = _mm256_mul_ps(x, num);

        /* Denominator: 135135 + xx * (62370 + xx * (3150 + xx * 28)). */
        __m256 den = _mm256_fmadd_ps(xx, k28, k3150);
        den = _mm256_fmadd_ps(xx, den, k62370);
        den = _mm256_fmadd_ps(xx, den, k135135);

        _mm256_store_ps(out, _mm256_div_ps(num, den));

        in += 8;
        out += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 8);
}
252 | #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ | ||
253 | |||
254 | #endif /* INCLUDED_volk_32f_tanh_32f_a_H */ | ||
255 | |||
256 | |||
257 | #ifndef INCLUDED_volk_32f_tanh_32f_u_H | ||
258 | #define INCLUDED_volk_32f_tanh_32f_u_H | ||
259 | |||
260 | #include <inttypes.h> | ||
261 | #include <math.h> | ||
262 | #include <stdio.h> | ||
263 | #include <string.h> | ||
264 | |||
265 | |||
266 | #ifdef LV_HAVE_SSE | ||
267 | #include <xmmintrin.h> | ||
268 | |||
/*!
 * \brief SSE kernel using unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps).
 *
 * Processes four samples per iteration with the rational approximation
 *   x * (135135 + x2 * (17325 + x2 * (378 + x2)))
 *   / (135135 + x2 * (62370 + x2 * (3150 + x2 * 28))),  x2 = x * x,
 * then hands the remaining (num_points % 4) samples to
 * volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 4;

    const float* in = aVector;
    float* out = cVector;

    const __m128 k135135 = _mm_set_ps1(135135.0f);
    const __m128 k17325 = _mm_set_ps1(17325.0f);
    const __m128 k378 = _mm_set_ps1(378.0f);
    const __m128 k62370 = _mm_set_ps1(62370.0f);
    const __m128 k3150 = _mm_set_ps1(3150.0f);
    const __m128 k28 = _mm_set_ps1(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m128 x = _mm_loadu_ps(in);
        const __m128 xx = _mm_mul_ps(x, x);

        /* Numerator, evaluated innermost term first. */
        __m128 num = _mm_add_ps(k378, xx);
        num = _mm_mul_ps(xx, num);
        num = _mm_add_ps(k17325, num);
        num = _mm_mul_ps(xx, num);
        num = _mm_add_ps(k135135, num);
        num = _mm_mul_ps(x, num);

        /* Denominator, evaluated innermost term first. */
        __m128 den = _mm_mul_ps(xx, k28);
        den = _mm_add_ps(k3150, den);
        den = _mm_mul_ps(xx, den);
        den = _mm_add_ps(k62370, den);
        den = _mm_mul_ps(xx, den);
        den = _mm_add_ps(k135135, den);

        _mm_storeu_ps(out, _mm_div_ps(num, den));

        in += 4;
        out += 4;
    }

    /* Scalar tail: whatever did not fill a complete 4-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 4);
}
314 | #endif /* LV_HAVE_SSE */ | ||
315 | |||
316 | |||
317 | #ifdef LV_HAVE_AVX | ||
318 | #include <immintrin.h> | ||
319 | |||
/*!
 * \brief AVX kernel using unaligned loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 *
 * Processes eight samples per iteration with the rational approximation
 *   x * (135135 + x2 * (17325 + x2 * (378 + x2)))
 *   / (135135 + x2 * (62370 + x2 * (3150 + x2 * 28))),  x2 = x * x,
 * then hands the remaining (num_points % 8) samples to
 * volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    const __m256 k135135 = _mm256_set1_ps(135135.0f);
    const __m256 k17325 = _mm256_set1_ps(17325.0f);
    const __m256 k378 = _mm256_set1_ps(378.0f);
    const __m256 k62370 = _mm256_set1_ps(62370.0f);
    const __m256 k3150 = _mm256_set1_ps(3150.0f);
    const __m256 k28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m256 x = _mm256_loadu_ps(in);
        const __m256 xx = _mm256_mul_ps(x, x);

        /* Numerator, evaluated innermost term first. */
        __m256 num = _mm256_add_ps(k378, xx);
        num = _mm256_mul_ps(xx, num);
        num = _mm256_add_ps(k17325, num);
        num = _mm256_mul_ps(xx, num);
        num = _mm256_add_ps(k135135, num);
        num = _mm256_mul_ps(x, num);

        /* Denominator, evaluated innermost term first. */
        __m256 den = _mm256_mul_ps(xx, k28);
        den = _mm256_add_ps(k3150, den);
        den = _mm256_mul_ps(xx, den);
        den = _mm256_add_ps(k62370, den);
        den = _mm256_mul_ps(xx, den);
        den = _mm256_add_ps(k135135, den);

        _mm256_storeu_ps(out, _mm256_div_ps(num, den));

        in += 8;
        out += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 8);
}
369 | #endif /* LV_HAVE_AVX */ | ||
370 | |||
371 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
372 | #include <immintrin.h> | ||
373 | |||
/*!
 * \brief AVX+FMA kernel using unaligned loads/stores.
 *
 * Same rational approximation as the plain AVX kernel, but each
 * "x2 * inner + constant" step is a single _mm256_fmadd_ps. Eight samples
 * are processed per iteration; the remaining (num_points % 8) samples are
 * handed to volk_32f_tanh_32f_series().
 *
 * NOTE(review): the vector loop applies no saturation, whereas the scalar
 * tail helper clamps beyond +/-4.97 -- confirm callers keep inputs in range.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    const __m256 k135135 = _mm256_set1_ps(135135.0f);
    const __m256 k17325 = _mm256_set1_ps(17325.0f);
    const __m256 k378 = _mm256_set1_ps(378.0f);
    const __m256 k62370 = _mm256_set1_ps(62370.0f);
    const __m256 k3150 = _mm256_set1_ps(3150.0f);
    const __m256 k28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < vectorIterations; ++i) {
        const __m256 x = _mm256_loadu_ps(in);
        const __m256 xx = _mm256_mul_ps(x, x);

        /* Numerator: x * (135135 + xx * (17325 + xx * (378 + xx))). */
        __m256 num = _mm256_add_ps(k378, xx);
        num = _mm256_fmadd_ps(xx, num, k17325);
        num = _mm256_fmadd_ps(xx, num, k135135);
        num = _mm256_mul_ps(x, num);

        /* Denominator: 135135 + xx * (62370 + xx * (3150 + xx * 28)). */
        __m256 den = _mm256_fmadd_ps(xx, k28, k3150);
        den = _mm256_fmadd_ps(xx, den, k62370);
        den = _mm256_fmadd_ps(xx, den, k135135);

        _mm256_storeu_ps(out, _mm256_div_ps(num, den));

        in += 8;
        out += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-wide vector. */
    volk_32f_tanh_32f_series(out, in, num_points - vectorIterations * 8);
}
413 | #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ | ||
414 | |||
415 | #endif /* INCLUDED_volk_32f_tanh_32f_u_H */ | ||
416 |