GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_tanh_32f.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       151     153      98.7%
Functions:     8       8     100.0%
Branches:     18      20      90.0%
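
A note on the summary (reader's arithmetic, not part of the report data): 151 of 153 executable lines is 98.69%, shown rounded as 98.7%, and 18 of 20 branches is 90.0%. The two unexecuted lines and the two untaken branches coincide: they are the saturation cases in volk_32f_tanh_32f_series (source lines 89 and 91 below), which this run apparently never reached because no test input had |x| > 4.97.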

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_tanh_32f
12 *
13 * \b Overview
14 *
15 * Computes the hyperbolic tangent of each element of aVector:
16 *
17 * c[i] = tanh(a[i])
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
22 * \endcode
23 *
24 * \b Inputs
25 * \li aVector: The buffer of points.
26 * \li num_points: The number of values in the input buffer.
27 *
28 * \b Outputs
29 * \li cVector: The output buffer.
30 *
31 * \b Example
32 * \code
33 * int N = 10;
34 * unsigned int alignment = volk_get_alignment();
35 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
36 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
37 *
38 * for(unsigned int ii = 0; ii < N; ++ii){
39 * // artanh(x) = 0.5*log((1+x)/(1-x)), defined for |x| < 1
40 * float x = (float)ii / (float)N;
41 * in[ii] = 0.5 * std::log((1.f+x)/(1.f-x));
42 * }
43 *
44 * volk_32f_tanh_32f(out, in, N);
45 *
46 * for(unsigned int ii = 0; ii < N; ++ii){
47 * printf("out(%u) = %f\n", ii, out[ii]);
48 * }
49 *
50 * volk_free(in);
51 * volk_free(out);
52 * \endcode
53 */
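
A quick check on the example above: the loop feeds the kernel in[ii] = artanh(x) with x = ii/N, and tanh(artanh(x)) = x for |x| < 1, so the printed values should come out as approximately 0.0, 0.1, ..., 0.9 (up to the accuracy of whichever implementation the dispatcher selects).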
54
55 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
56 #define INCLUDED_volk_32f_tanh_32f_a_H
57
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <string.h>
62
63
64 #ifdef LV_HAVE_GENERIC
65
66 static inline void
67 2 volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
68 {
69 2 unsigned int number = 0;
70 2 float* cPtr = cVector;
71 2 const float* aPtr = aVector;
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
72 262144 for (; number < num_points; number++) {
73 262142 *cPtr++ = tanhf(*aPtr++);
74 }
75 2 }
76
77 #endif /* LV_HAVE_GENERIC */
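
The branch counts above also pin down the test size: 262142 loop iterations across 2 calls of the generic kernel are consistent with num_points = 131071 per call, a figure the SIMD tail handling below has to deal with since 131071 is not a multiple of 4 or 8.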
78
79
80 #ifdef LV_HAVE_GENERIC
81
82 static inline void
83 14 volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
84 {
85 14 float* cPtr = cVector;
86 14 const float* aPtr = aVector;
2/2
✓ Branch 0 taken 262210 times.
✓ Branch 1 taken 14 times.
87 262224 for (unsigned int number = 0; number < num_points; number++) {
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 262210 times.
88 262210 if (*aPtr > 4.97)
89 *cPtr++ = 1, aPtr++; /* saturate; keep aPtr in step with cPtr */
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 262210 times.
90 262210 else if (*aPtr <= -4.97)
91 *cPtr++ = -1, aPtr++; /* saturate; keep aPtr in step with cPtr */
92 else {
93 262210 float x2 = (*aPtr) * (*aPtr);
94 262210 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
95 262210 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
96 262210 *cPtr++ = a / b;
97 262210 aPtr++;
98 }
99 }
100 14 }
101
102 #endif /* LV_HAVE_GENERIC */
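
For reference, the constants in the series kernel are the coefficients of the classic degree-(7,6) rational approximation of tanh obtained by truncating Lambert's continued fraction; this reading of the code is an inference from the constants, not something stated in the source:

\tanh(x) \;\approx\; \frac{x\left(x^{6} + 378x^{4} + 17325x^{2} + 135135\right)}
                          {28x^{6} + 3150x^{4} + 62370x^{2} + 135135}

The clamp at |x| > 4.97 keeps the output inside [-1, 1]: tanh is already within about 1e-4 of ±1 there, while for somewhat larger |x| the rational form eventually exceeds 1 in magnitude.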
103
104
105 #ifdef LV_HAVE_SSE
106 #include <xmmintrin.h>
107
108 static inline void
109 2 volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
110 {
111 2 unsigned int number = 0;
112 2 const unsigned int quarterPoints = num_points / 4;
113
114 2 float* cPtr = cVector;
115 2 const float* aPtr = aVector;
116
117 __m128 aVal, cVal, x2, a, b;
118 __m128 const1, const2, const3, const4, const5, const6;
119 2 const1 = _mm_set_ps1(135135.0f);
120 2 const2 = _mm_set_ps1(17325.0f);
121 2 const3 = _mm_set_ps1(378.0f);
122 2 const4 = _mm_set_ps1(62370.0f);
123 2 const5 = _mm_set_ps1(3150.0f);
124 2 const6 = _mm_set_ps1(28.0f);
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
125 65536 for (; number < quarterPoints; number++) {
126
127 65534 aVal = _mm_load_ps(aPtr);
128 65534 x2 = _mm_mul_ps(aVal, aVal);
129 393204 a = _mm_mul_ps(
130 aVal,
131 _mm_add_ps(
132 const1,
133 _mm_mul_ps(x2,
134 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
135 393204 b = _mm_add_ps(
136 const1,
137 _mm_mul_ps(
138 x2,
139 _mm_add_ps(const4,
140 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
141
142 65534 cVal = _mm_div_ps(a, b);
143
144 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
145
146 65534 aPtr += 4;
147 65534 cPtr += 4;
148 }
149
150 2 number = quarterPoints * 4;
151 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
152 2 }
153 #endif /* LV_HAVE_SSE */
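
The nested intrinsics above evaluate the numerator and denominator with Horner's scheme in x², four lanes at a time. As a readability aid, a scalar model of what each lane computes (illustrative sketch, not part of the measured source):

/* Scalar model of one SSE lane of the kernel above (illustrative only). */
static inline float tanh_rational_lane(float x)
{
    const float x2 = x * x;
    /* numerator: x * (135135 + x2*(17325 + x2*(378 + x2))) */
    const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
    /* denominator: 135135 + x2*(62370 + x2*(3150 + 28*x2)) */
    const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + 28.0f * x2));
    return a / b;
}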
154
155
156 #ifdef LV_HAVE_AVX
157 #include <immintrin.h>
158
159 static inline void
160 2 volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
161 {
162 2 unsigned int number = 0;
163 2 const unsigned int eighthPoints = num_points / 8;
164
165 2 float* cPtr = cVector;
166 2 const float* aPtr = aVector;
167
168 __m256 aVal, cVal, x2, a, b;
169 __m256 const1, const2, const3, const4, const5, const6;
170 2 const1 = _mm256_set1_ps(135135.0f);
171 2 const2 = _mm256_set1_ps(17325.0f);
172 2 const3 = _mm256_set1_ps(378.0f);
173 2 const4 = _mm256_set1_ps(62370.0f);
174 2 const5 = _mm256_set1_ps(3150.0f);
175 2 const6 = _mm256_set1_ps(28.0f);
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
176 32768 for (; number < eighthPoints; number++) {
177
178 32766 aVal = _mm256_load_ps(aPtr);
179 32766 x2 = _mm256_mul_ps(aVal, aVal);
180 196596 a = _mm256_mul_ps(
181 aVal,
182 _mm256_add_ps(
183 const1,
184 _mm256_mul_ps(
185 x2,
186 _mm256_add_ps(const2,
187 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
188 196596 b = _mm256_add_ps(
189 const1,
190 _mm256_mul_ps(
191 x2,
192 _mm256_add_ps(
193 const4,
194 _mm256_mul_ps(x2,
195 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
196
197 32766 cVal = _mm256_div_ps(a, b);
198
199 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
200
201 32766 aPtr += 8;
202 32766 cPtr += 8;
203 }
204
205 2 number = eighthPoints * 8;
206 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
207 2 }
208 #endif /* LV_HAVE_AVX */
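
The AVX loop counts match the same test size: with num_points = 131071 per call, eighthPoints = 131071 / 8 = 16383, so two calls give 2 * 16383 = 32766 vector iterations (as reported) and each call passes the remaining 131071 - 8*16383 = 7 samples to volk_32f_tanh_32f_series. Counting all tail calls, the series kernel's 262210 iterations break down (consistently with these numbers) as 2*131071 direct + 4*3 from the SSE variants + 8*7 from the AVX variants.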
209
210 #if LV_HAVE_AVX && LV_HAVE_FMA
211 #include <immintrin.h>
212
213 static inline void
214 2 volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
215 {
216 2 unsigned int number = 0;
217 2 const unsigned int eighthPoints = num_points / 8;
218
219 2 float* cPtr = cVector;
220 2 const float* aPtr = aVector;
221
222 __m256 aVal, cVal, x2, a, b;
223 __m256 const1, const2, const3, const4, const5, const6;
224 2 const1 = _mm256_set1_ps(135135.0f);
225 2 const2 = _mm256_set1_ps(17325.0f);
226 2 const3 = _mm256_set1_ps(378.0f);
227 2 const4 = _mm256_set1_ps(62370.0f);
228 2 const5 = _mm256_set1_ps(3150.0f);
229 2 const6 = _mm256_set1_ps(28.0f);
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
230 32768 for (; number < eighthPoints; number++) {
231
232 32766 aVal = _mm256_load_ps(aPtr);
233 32766 x2 = _mm256_mul_ps(aVal, aVal);
234 131064 a = _mm256_mul_ps(
235 aVal,
236 _mm256_fmadd_ps(
237 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
238 98298 b = _mm256_fmadd_ps(
239 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
240
241 32766 cVal = _mm256_div_ps(a, b);
242
243 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
244
245 32766 aPtr += 8;
246 32766 cPtr += 8;
247 }
248
249 2 number = eighthPoints * 8;
250 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
251 2 }
252 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
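
The FMA variant computes the same two Horner chains with fused multiply-adds: _mm256_fmadd_ps(a, b, c) yields a*b + c per lane. A scalar sketch using C99 fmaf, mirroring the nesting above (illustrative only):

#include <math.h>

/* Scalar model of one FMA lane: same rational approximation, fused. */
static inline float tanh_rational_fma_lane(float x)
{
    const float x2 = x * x;
    const float a = x * fmaf(x2, fmaf(x2, 378.0f + x2, 17325.0f), 135135.0f);
    const float b = fmaf(x2, fmaf(x2, fmaf(x2, 28.0f, 3150.0f), 62370.0f), 135135.0f);
    return a / b;
}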
253
254 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */
255
256
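
The second half of the header repeats the SSE/AVX/FMA kernels with unaligned loads and stores (_mm_loadu_ps/_mm_storeu_ps and their _mm256_ counterparts) for buffers that do not meet VOLK's alignment requirement. A minimal sketch of the kind of check a caller could make before picking an _a_ kernel directly, assuming only volk_get_alignment() from the VOLK API (the helper name is illustrative):

#include <stdint.h>
#include <volk/volk.h>

/* Illustrative helper: non-zero if both buffers meet VOLK's SIMD alignment. */
static int tanh_buffers_aligned(const float* out, const float* in)
{
    const size_t align = volk_get_alignment();
    return ((uintptr_t)out % align == 0) && ((uintptr_t)in % align == 0);
}

In normal use the generated volk_32f_tanh_32f dispatcher performs this selection itself.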
257 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
258 #define INCLUDED_volk_32f_tanh_32f_u_H
259
260 #include <inttypes.h>
261 #include <math.h>
262 #include <stdio.h>
263 #include <string.h>
264
265
266 #ifdef LV_HAVE_SSE
267 #include <xmmintrin.h>
268
269 static inline void
270 2 volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
271 {
272 2 unsigned int number = 0;
273 2 const unsigned int quarterPoints = num_points / 4;
274
275 2 float* cPtr = cVector;
276 2 const float* aPtr = aVector;
277
278 __m128 aVal, cVal, x2, a, b;
279 __m128 const1, const2, const3, const4, const5, const6;
280 2 const1 = _mm_set_ps1(135135.0f);
281 2 const2 = _mm_set_ps1(17325.0f);
282 2 const3 = _mm_set_ps1(378.0f);
283 2 const4 = _mm_set_ps1(62370.0f);
284 2 const5 = _mm_set_ps1(3150.0f);
285 2 const6 = _mm_set_ps1(28.0f);
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
286 65536 for (; number < quarterPoints; number++) {
287
288 65534 aVal = _mm_loadu_ps(aPtr);
289 65534 x2 = _mm_mul_ps(aVal, aVal);
290 393204 a = _mm_mul_ps(
291 aVal,
292 _mm_add_ps(
293 const1,
294 _mm_mul_ps(x2,
295 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
296 393204 b = _mm_add_ps(
297 const1,
298 _mm_mul_ps(
299 x2,
300 _mm_add_ps(const4,
301 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
302
303 65534 cVal = _mm_div_ps(a, b);
304
305 _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
306
307 65534 aPtr += 4;
308 65534 cPtr += 4;
309 }
310
311 2 number = quarterPoints * 4;
312 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
313 2 }
314 #endif /* LV_HAVE_SSE */
315
316
317 #ifdef LV_HAVE_AVX
318 #include <immintrin.h>
319
320 static inline void
321 2 volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
322 {
323 2 unsigned int number = 0;
324 2 const unsigned int eighthPoints = num_points / 8;
325
326 2 float* cPtr = cVector;
327 2 const float* aPtr = aVector;
328
329 __m256 aVal, cVal, x2, a, b;
330 __m256 const1, const2, const3, const4, const5, const6;
331 2 const1 = _mm256_set1_ps(135135.0f);
332 2 const2 = _mm256_set1_ps(17325.0f);
333 2 const3 = _mm256_set1_ps(378.0f);
334 2 const4 = _mm256_set1_ps(62370.0f);
335 2 const5 = _mm256_set1_ps(3150.0f);
336 2 const6 = _mm256_set1_ps(28.0f);
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
337 32768 for (; number < eighthPoints; number++) {
338
339 32766 aVal = _mm256_loadu_ps(aPtr);
340 32766 x2 = _mm256_mul_ps(aVal, aVal);
341 196596 a = _mm256_mul_ps(
342 aVal,
343 _mm256_add_ps(
344 const1,
345 _mm256_mul_ps(
346 x2,
347 _mm256_add_ps(const2,
348 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
349 196596 b = _mm256_add_ps(
350 const1,
351 _mm256_mul_ps(
352 x2,
353 _mm256_add_ps(
354 const4,
355 _mm256_mul_ps(x2,
356 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
357
358 32766 cVal = _mm256_div_ps(a, b);
359
360 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
361
362 32766 aPtr += 8;
363 32766 cPtr += 8;
364 }
365
366 2 number = eighthPoints * 8;
367 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
368 2 }
369 #endif /* LV_HAVE_AVX */
370
371 #if LV_HAVE_AVX && LV_HAVE_FMA
372 #include <immintrin.h>
373
374 static inline void
375 2 volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
376 {
377 2 unsigned int number = 0;
378 2 const unsigned int eighthPoints = num_points / 8;
379
380 2 float* cPtr = cVector;
381 2 const float* aPtr = aVector;
382
383 __m256 aVal, cVal, x2, a, b;
384 __m256 const1, const2, const3, const4, const5, const6;
385 2 const1 = _mm256_set1_ps(135135.0f);
386 2 const2 = _mm256_set1_ps(17325.0f);
387 2 const3 = _mm256_set1_ps(378.0f);
388 2 const4 = _mm256_set1_ps(62370.0f);
389 2 const5 = _mm256_set1_ps(3150.0f);
390 2 const6 = _mm256_set1_ps(28.0f);
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
391 32768 for (; number < eighthPoints; number++) {
392
393 32766 aVal = _mm256_loadu_ps(aPtr);
394 32766 x2 = _mm256_mul_ps(aVal, aVal);
395 131064 a = _mm256_mul_ps(
396 aVal,
397 _mm256_fmadd_ps(
398 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
399 98298 b = _mm256_fmadd_ps(
400 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
401
402 32766 cVal = _mm256_div_ps(a, b);
403
404 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
405
406 32766 aPtr += 8;
407 32766 cPtr += 8;
408 }
409
410 2 number = eighthPoints * 8;
411 2 volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
412 2 }
413 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
414
415 #endif /* INCLUDED_volk_32f_tanh_32f_u_H */
416