GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_tan_32f.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       317     317     100.0%
Functions:     7       7     100.0%
Branches:     38      38     100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_tan_32f
12 *
13 * \b Overview
14 *
15 * Computes the tangent of each element of the aVector.
16 *
17 * b[i] = tan(a[i])
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points)
22 * \endcode
23 *
24 * \b Inputs
25 * \li aVector: The buffer of points.
26 * \li num_points: The number of values in the input buffer.
27 *
28 * \b Outputs
29 * \li bVector: The output buffer.
30 *
31 * \b Example
32 * Calculate tan(theta) for common angles.
33 * \code
34 * int N = 10;
35 * unsigned int alignment = volk_get_alignment();
36 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
37 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
38 *
39 * in[0] = 0.000;
40 * in[1] = 0.524;
41 * in[2] = 0.785;
42 * in[3] = 1.047;
43 * in[4] = 1.571;
44 * in[5] = 1.571;
45 * in[6] = -1.047;
46 * in[7] = -0.785;
47 * in[8] = -0.524;
48 * in[9] = -0.000;
49 *
50 * volk_32f_tan_32f(out, in, N);
51 *
52 * for(unsigned int ii = 0; ii < N; ++ii){
53 * printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]);
54 * }
55 *
56 * volk_free(in);
57 * volk_free(out);
58 * \endcode
59 */
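Every SIMD kernel in this file follows the same scheme: fold |x| into an octant of width pi/4 using the split constant pio4A + pio4B, shrink the reduced angle by 2^3, evaluate a short series for 2*(1 - cos y), undo the shrink with three half-angle doublings, and finally reconstruct sine, cosine and their quotient with quadrant fix-ups. The unvectorized sketch below mirrors those steps one-to-one; the helper name tan32f_scalar_sketch is made up for illustration and is not part of VOLK, and volk_32f_tan_32f remains the supported entry point.

    #include <math.h>

    /* Minimal scalar sketch of the scheme used by the SIMD kernels below. */
    static inline float tan32f_scalar_sketch(float x)
    {
        const float m4pi = 1.273239545f;   /* 4/pi */
        const float pio4A = 0.78515625f;   /* pi/4, high part (201/256) */
        const float pio4B = 0.241876e-3f;  /* pi/4, low part */

        float s = fabsf(x);
        int q = (int)floorf(s * m4pi);     /* octant index */
        int r = q + (q & 1);               /* nearest even multiple of pi/4 */
        s -= (float)r * pio4A;             /* extended-precision reduction */
        s -= (float)r * pio4B;

        s /= 8.0f;                         /* 2^3: pre-shrink for 3 doublings */
        s *= s;
        /* Short series for 2*(1 - cos y), evaluated in y^2 */
        s = ((((s * 0.551e-6f - 0.49603e-4f) * s + 0.2777778e-2f) * s -
              0.83333333e-1f) * s + 1.0f) * s;
        for (int i = 0; i < 3; i++)
            s = s * (4.0f - s);            /* 2(1 - cos 2y) = s(4 - s) */
        s /= 2.0f;                         /* s = 1 - cos(reduced angle) */

        float sine = sqrtf((2.0f - s) * s);
        float cosine = 1.0f - s;

        /* Quadrant fix-ups, mirroring condition1..condition3 in the kernels */
        if ((q + 1) & 2) {                 /* swap sine and cosine */
            float t = cosine;
            cosine = sine;
            sine = t;
        }
        if (((q & 4) != 0) != (x < 0.0f))  /* sign of sine */
            sine = -sine;
        if ((q + 2) & 4)                   /* sign of cosine */
            cosine = -cosine;

        return sine / cosine;
    }

Each kernel below applies exactly these steps to 4 or 8 lanes at once, with the branches replaced by comparison masks.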
60
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64
65 #ifndef INCLUDED_volk_32f_tan_32f_a_H
66 #define INCLUDED_volk_32f_tan_32f_a_H
67
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70
71 static inline void
72 2 volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
73 {
74 2 float* bPtr = bVector;
75 2 const float* aPtr = aVector;
76
77 2 unsigned int number = 0;
78 2 unsigned int eighthPoints = num_points / 8;
79 2 unsigned int i = 0;
80
81 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
82 fzeroes;
83 __m256 sine, cosine, tangent, condition1, condition2, condition3;
84 __m256i q, r, ones, twos, fours;
85
86 2 m4pi = _mm256_set1_ps(1.273239545);
87 2 pio4A = _mm256_set1_ps(0.78515625);
88 2 pio4B = _mm256_set1_ps(0.241876e-3);
89 2 ffours = _mm256_set1_ps(4.0);
90 2 ftwos = _mm256_set1_ps(2.0);
91 2 fones = _mm256_set1_ps(1.0);
92 2 fzeroes = _mm256_setzero_ps();
93 2 ones = _mm256_set1_epi32(1);
94 2 twos = _mm256_set1_epi32(2);
95 2 fours = _mm256_set1_epi32(4);
96
97 2 cp1 = _mm256_set1_ps(1.0);
98 2 cp2 = _mm256_set1_ps(0.83333333e-1);
99 2 cp3 = _mm256_set1_ps(0.2777778e-2);
100 2 cp4 = _mm256_set1_ps(0.49603e-4);
101 2 cp5 = _mm256_set1_ps(0.551e-6);
102
103
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
104 32766 aVal = _mm256_load_ps(aPtr);
105 98298 s = _mm256_sub_ps(aVal,
106 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
107 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
108 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
109 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
110
111 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
112 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
113
114 65532 s = _mm256_div_ps(
115 s,
116 _mm256_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
117 32766 s = _mm256_mul_ps(s, s);
118 // Evaluate Taylor series
119 131064 s = _mm256_mul_ps(
120 _mm256_fmadd_ps(
121 _mm256_fmsub_ps(
122 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
123 s,
124 cp1),
125 s);
126
127
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
128 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
129 }
130 32766 s = _mm256_div_ps(s, ftwos);
131
132 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
133 32766 cosine = _mm256_sub_ps(fones, s);
134
135 65532 condition1 = _mm256_cmp_ps(
136 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
137 fzeroes,
138 _CMP_NEQ_UQ);
139 98298 condition2 = _mm256_cmp_ps(
140 _mm256_cmp_ps(
141 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
142 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
143 _CMP_NEQ_UQ);
144 65532 condition3 = _mm256_cmp_ps(
145 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
146 fzeroes,
147 _CMP_NEQ_UQ);
148
149 32766 __m256 temp = cosine;
150 cosine =
151 98298 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
152 98298 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
153 131064 sine = _mm256_sub_ps(
154 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
155 131064 cosine = _mm256_sub_ps(
156 cosine,
157 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
158 32766 tangent = _mm256_div_ps(sine, cosine);
159 _mm256_store_ps(bPtr, tangent);
160 32766 aPtr += 8;
161 32766 bPtr += 8;
162 }
163
164 2 number = eighthPoints * 8;
165
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
166 14 *bPtr++ = tanf(*aPtr++);
167 }
168 2 }
169
170 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
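The only difference between the FMA kernel above and the plain AVX2 kernel that follows is how the Cody-Waite subtraction s - r*pio4A (and likewise the polynomial) is evaluated: _mm256_fnmadd_ps performs the multiply and subtract with a single rounding, while the non-FMA path rounds the product first. A minimal side-by-side sketch; the helper names are illustrative only and assume compilation with AVX2 and FMA enabled.

    #include <immintrin.h>

    /* One Cody-Waite reduction step, fused: s - r*c with a single rounding. */
    static inline __m256 reduce_step_fma(__m256 s, __m256i r, __m256 c)
    {
        return _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), c, s);
    }

    /* The same step without FMA: the product r*c is rounded before the subtract. */
    static inline __m256 reduce_step_plain(__m256 s, __m256i r, __m256 c)
    {
        return _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), c));
    }

The Taylor polynomial is restructured the same way, using _mm256_fmadd_ps/_mm256_fmsub_ps in place of separate multiplies and adds.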
171
172 #ifdef LV_HAVE_AVX2
173 #include <immintrin.h>
174
175 static inline void
176 2 volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
177 {
178 2 float* bPtr = bVector;
179 2 const float* aPtr = aVector;
180
181 2 unsigned int number = 0;
182 2 unsigned int eighthPoints = num_points / 8;
183 2 unsigned int i = 0;
184
185 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
186 fzeroes;
187 __m256 sine, cosine, tangent, condition1, condition2, condition3;
188 __m256i q, r, ones, twos, fours;
189
190 2 m4pi = _mm256_set1_ps(1.273239545);
191 2 pio4A = _mm256_set1_ps(0.78515625);
192 2 pio4B = _mm256_set1_ps(0.241876e-3);
193 2 ffours = _mm256_set1_ps(4.0);
194 2 ftwos = _mm256_set1_ps(2.0);
195 2 fones = _mm256_set1_ps(1.0);
196 2 fzeroes = _mm256_setzero_ps();
197 2 ones = _mm256_set1_epi32(1);
198 2 twos = _mm256_set1_epi32(2);
199 2 fours = _mm256_set1_epi32(4);
200
201 2 cp1 = _mm256_set1_ps(1.0);
202 2 cp2 = _mm256_set1_ps(0.83333333e-1);
203 2 cp3 = _mm256_set1_ps(0.2777778e-2);
204 2 cp4 = _mm256_set1_ps(0.49603e-4);
205 2 cp5 = _mm256_set1_ps(0.551e-6);
206
207
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
208 32766 aVal = _mm256_load_ps(aPtr);
209 98298 s = _mm256_sub_ps(aVal,
210 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
211 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
212 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
213 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
214
215 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
216 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
217
218 65532 s = _mm256_div_ps(
219 s,
220 _mm256_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
221 32766 s = _mm256_mul_ps(s, s);
222 // Evaluate Taylor series
223 262128 s = _mm256_mul_ps(
224 _mm256_add_ps(
225 _mm256_mul_ps(
226 _mm256_sub_ps(
227 _mm256_mul_ps(
228 _mm256_add_ps(
229 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
230 s),
231 cp3),
232 s),
233 cp2),
234 s),
235 cp1),
236 s);
237
238
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
239 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
240 }
241 32766 s = _mm256_div_ps(s, ftwos);
242
243 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
244 32766 cosine = _mm256_sub_ps(fones, s);
245
246 65532 condition1 = _mm256_cmp_ps(
247 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
248 fzeroes,
249 _CMP_NEQ_UQ);
250 98298 condition2 = _mm256_cmp_ps(
251 _mm256_cmp_ps(
252 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
253 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
254 _CMP_NEQ_UQ);
255 65532 condition3 = _mm256_cmp_ps(
256 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
257 fzeroes,
258 _CMP_NEQ_UQ);
259
260 32766 __m256 temp = cosine;
261 cosine =
262 98298 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
263 98298 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
264 131064 sine = _mm256_sub_ps(
265 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
266 131064 cosine = _mm256_sub_ps(
267 cosine,
268 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
269 32766 tangent = _mm256_div_ps(sine, cosine);
270 _mm256_store_ps(bPtr, tangent);
271 32766 aPtr += 8;
272 32766 bPtr += 8;
273 }
274
275 2 number = eighthPoints * 8;
276
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
277 14 *bPtr++ = tanf(*aPtr++);
278 }
279 2 }
280
281 #endif /* LV_HAVE_AVX2 for aligned */
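Both AVX2 kernels, and the SSE4.1 ones below, subtract r*pio4A and r*pio4B separately rather than r*(pi/4). pio4A is 201/256, so it has only a handful of mantissa bits and its product with a moderate octant index r is exact in single precision; pio4B then supplies the next bits of pi/4, preserving precision that a single float constant would lose. A small stand-alone check of the split; this is a sketch, not part of the library.

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi_4 = atan(1.0);       /* pi/4 in double precision */
        const double pio4A = 0.78515625;     /* 201/256, exactly representable */
        const double pio4B = 0.241876e-3;

        printf("pi/4          = %.10f\n", pi_4);
        printf("pio4A + pio4B = %.10f\n", pio4A + pio4B);
        printf("residual      = %.3e\n", pi_4 - (pio4A + pio4B));
        return 0;
    }

The leftover residual of roughly 4e-8 per multiple of pi/4 is the part the kernels do not correct, so accuracy can be expected to degrade for very large arguments.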
282
283 #ifdef LV_HAVE_SSE4_1
284 #include <smmintrin.h>
285
286 static inline void
287 2 volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
288 {
289 2 float* bPtr = bVector;
290 2 const float* aPtr = aVector;
291
292 2 unsigned int number = 0;
293 2 unsigned int quarterPoints = num_points / 4;
294 2 unsigned int i = 0;
295
296 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
297 fzeroes;
298 __m128 sine, cosine, tangent, condition1, condition2, condition3;
299 __m128i q, r, ones, twos, fours;
300
301 2 m4pi = _mm_set1_ps(1.273239545);
302 2 pio4A = _mm_set1_ps(0.78515625);
303 2 pio4B = _mm_set1_ps(0.241876e-3);
304 2 ffours = _mm_set1_ps(4.0);
305 2 ftwos = _mm_set1_ps(2.0);
306 2 fones = _mm_set1_ps(1.0);
307 2 fzeroes = _mm_setzero_ps();
308 2 ones = _mm_set1_epi32(1);
309 2 twos = _mm_set1_epi32(2);
310 2 fours = _mm_set1_epi32(4);
311
312 2 cp1 = _mm_set1_ps(1.0);
313 2 cp2 = _mm_set1_ps(0.83333333e-1);
314 2 cp3 = _mm_set1_ps(0.2777778e-2);
315 2 cp4 = _mm_set1_ps(0.49603e-4);
316 2 cp5 = _mm_set1_ps(0.551e-6);
317
318
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
319 65534 aVal = _mm_load_ps(aPtr);
320 262136 s = _mm_sub_ps(aVal,
321 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
322 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
323 131068 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
324
325 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
326 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
327
328 131068 s = _mm_div_ps(
329 s, _mm_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
330 65534 s = _mm_mul_ps(s, s);
331 // Evaluate Taylor series
332 524272 s = _mm_mul_ps(
333 _mm_add_ps(
334 _mm_mul_ps(
335 _mm_sub_ps(
336 _mm_mul_ps(
337 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
338 cp3),
339 s),
340 cp2),
341 s),
342 cp1),
343 s);
344
345
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++) {
346 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
347 }
348 65534 s = _mm_div_ps(s, ftwos);
349
350 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
351 65534 cosine = _mm_sub_ps(fones, s);
352
353 262136 condition1 = _mm_cmpneq_ps(
354 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
355 327670 condition2 = _mm_cmpneq_ps(
356 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
357 _mm_cmplt_ps(aVal, fzeroes));
358 196602 condition3 = _mm_cmpneq_ps(
359 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
360
361 65534 __m128 temp = cosine;
362 196602 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
363 196602 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
364 sine =
365 262136 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
366 262136 cosine = _mm_sub_ps(
367 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
368 65534 tangent = _mm_div_ps(sine, cosine);
369 _mm_store_ps(bPtr, tangent);
370 65534 aPtr += 4;
371 65534 bPtr += 4;
372 }
373
374 2 number = quarterPoints * 4;
375
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
376 6 *bPtr++ = tanf(*aPtr++);
377 }
378 2 }
379
380 #endif /* LV_HAVE_SSE4_1 for aligned */
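The three-iteration loop s = s*(4 - s) that every kernel runs is an angle-doubling step: if s holds 2*(1 - cos(theta)), then s*(4 - s) equals 2*(1 - cos(2*theta)), which follows from cos(2*theta) = 1 - 2*sin^2(theta). Dividing the argument by 8 up front and doubling three times therefore recovers the original reduced angle while the series only ever sees a tiny input. A quick numeric confirmation, illustrative and not part of the file:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double theta = 0.3;
        double s = 2.0 * (1.0 - cos(theta));

        s = s * (4.0 - s);   /* one doubling step */

        printf("s*(4-s)            = %.12f\n", s);
        printf("2*(1-cos(2*theta)) = %.12f\n", 2.0 * (1.0 - cos(2.0 * theta)));
        return 0;
    }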
381
382
383 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
384
385 #ifndef INCLUDED_volk_32f_tan_32f_u_H
386 #define INCLUDED_volk_32f_tan_32f_u_H
387
388 #if LV_HAVE_AVX2 && LV_HAVE_FMA
389 #include <immintrin.h>
390
391 static inline void
392 2 volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
393 {
394 2 float* bPtr = bVector;
395 2 const float* aPtr = aVector;
396
397 2 unsigned int number = 0;
398 2 unsigned int eighthPoints = num_points / 8;
399 2 unsigned int i = 0;
400
401 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
402 fzeroes;
403 __m256 sine, cosine, tangent, condition1, condition2, condition3;
404 __m256i q, r, ones, twos, fours;
405
406 2 m4pi = _mm256_set1_ps(1.273239545);
407 2 pio4A = _mm256_set1_ps(0.78515625);
408 2 pio4B = _mm256_set1_ps(0.241876e-3);
409 2 ffours = _mm256_set1_ps(4.0);
410 2 ftwos = _mm256_set1_ps(2.0);
411 2 fones = _mm256_set1_ps(1.0);
412 2 fzeroes = _mm256_setzero_ps();
413 2 ones = _mm256_set1_epi32(1);
414 2 twos = _mm256_set1_epi32(2);
415 2 fours = _mm256_set1_epi32(4);
416
417 2 cp1 = _mm256_set1_ps(1.0);
418 2 cp2 = _mm256_set1_ps(0.83333333e-1);
419 2 cp3 = _mm256_set1_ps(0.2777778e-2);
420 2 cp4 = _mm256_set1_ps(0.49603e-4);
421 2 cp5 = _mm256_set1_ps(0.551e-6);
422
423
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
424 32766 aVal = _mm256_loadu_ps(aPtr);
425 98298 s = _mm256_sub_ps(aVal,
426 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
427 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
428 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
429 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
430
431 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
432 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
433
434 65532 s = _mm256_div_ps(
435 s,
436 _mm256_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
437 32766 s = _mm256_mul_ps(s, s);
438 // Evaluate Taylor series
439 131064 s = _mm256_mul_ps(
440 _mm256_fmadd_ps(
441 _mm256_fmsub_ps(
442 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
443 s,
444 cp1),
445 s);
446
447
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
448 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
449 }
450 32766 s = _mm256_div_ps(s, ftwos);
451
452 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
453 32766 cosine = _mm256_sub_ps(fones, s);
454
455 65532 condition1 = _mm256_cmp_ps(
456 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
457 fzeroes,
458 _CMP_NEQ_UQ);
459 98298 condition2 = _mm256_cmp_ps(
460 _mm256_cmp_ps(
461 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
462 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
463 _CMP_NEQ_UQ);
464 65532 condition3 = _mm256_cmp_ps(
465 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
466 fzeroes,
467 _CMP_NEQ_UQ);
468
469 32766 __m256 temp = cosine;
470 cosine =
471 98298 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
472 98298 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
473 131064 sine = _mm256_sub_ps(
474 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
475 131064 cosine = _mm256_sub_ps(
476 cosine,
477 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
478 32766 tangent = _mm256_div_ps(sine, cosine);
479 _mm256_storeu_ps(bPtr, tangent);
480 32766 aPtr += 8;
481 32766 bPtr += 8;
482 }
483
484 2 number = eighthPoints * 8;
485
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
486 14 *bPtr++ = tanf(*aPtr++);
487 }
488 2 }
489
490 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
491
492 #ifdef LV_HAVE_AVX2
493 #include <immintrin.h>
494
495 static inline void
496 2 volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
497 {
498 2 float* bPtr = bVector;
499 2 const float* aPtr = aVector;
500
501 2 unsigned int number = 0;
502 2 unsigned int eighthPoints = num_points / 8;
503 2 unsigned int i = 0;
504
505 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
506 fzeroes;
507 __m256 sine, cosine, tangent, condition1, condition2, condition3;
508 __m256i q, r, ones, twos, fours;
509
510 2 m4pi = _mm256_set1_ps(1.273239545);
511 2 pio4A = _mm256_set1_ps(0.78515625);
512 2 pio4B = _mm256_set1_ps(0.241876e-3);
513 2 ffours = _mm256_set1_ps(4.0);
514 2 ftwos = _mm256_set1_ps(2.0);
515 2 fones = _mm256_set1_ps(1.0);
516 2 fzeroes = _mm256_setzero_ps();
517 2 ones = _mm256_set1_epi32(1);
518 2 twos = _mm256_set1_epi32(2);
519 2 fours = _mm256_set1_epi32(4);
520
521 2 cp1 = _mm256_set1_ps(1.0);
522 2 cp2 = _mm256_set1_ps(0.83333333e-1);
523 2 cp3 = _mm256_set1_ps(0.2777778e-2);
524 2 cp4 = _mm256_set1_ps(0.49603e-4);
525 2 cp5 = _mm256_set1_ps(0.551e-6);
526
527
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
528 32766 aVal = _mm256_loadu_ps(aPtr);
529 98298 s = _mm256_sub_ps(aVal,
530 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
531 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
532 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
533 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
534
535 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
536 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
537
538 65532 s = _mm256_div_ps(
539 s,
540 _mm256_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
541 32766 s = _mm256_mul_ps(s, s);
542 // Evaluate Taylor series
543 262128 s = _mm256_mul_ps(
544 _mm256_add_ps(
545 _mm256_mul_ps(
546 _mm256_sub_ps(
547 _mm256_mul_ps(
548 _mm256_add_ps(
549 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
550 s),
551 cp3),
552 s),
553 cp2),
554 s),
555 cp1),
556 s);
557
558
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
559 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
560 }
561 32766 s = _mm256_div_ps(s, ftwos);
562
563 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
564 32766 cosine = _mm256_sub_ps(fones, s);
565
566 65532 condition1 = _mm256_cmp_ps(
567 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
568 fzeroes,
569 _CMP_NEQ_UQ);
570 98298 condition2 = _mm256_cmp_ps(
571 _mm256_cmp_ps(
572 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
573 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
574 _CMP_NEQ_UQ);
575 65532 condition3 = _mm256_cmp_ps(
576 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
577 fzeroes,
578 _CMP_NEQ_UQ);
579
580 32766 __m256 temp = cosine;
581 cosine =
582 98298 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
583 98298 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
584 131064 sine = _mm256_sub_ps(
585 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
586 131064 cosine = _mm256_sub_ps(
587 cosine,
588 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
589 32766 tangent = _mm256_div_ps(sine, cosine);
590 _mm256_storeu_ps(bPtr, tangent);
591 32766 aPtr += 8;
592 32766 bPtr += 8;
593 }
594
595 2 number = eighthPoints * 8;
596
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
597 14 *bPtr++ = tanf(*aPtr++);
598 }
599 2 }
600
601 #endif /* LV_HAVE_AVX2 for unaligned */
602
603
604 #ifdef LV_HAVE_SSE4_1
605 #include <smmintrin.h>
606
607 static inline void
608 2 volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
609 {
610 2 float* bPtr = bVector;
611 2 const float* aPtr = aVector;
612
613 2 unsigned int number = 0;
614 2 unsigned int quarterPoints = num_points / 4;
615 2 unsigned int i = 0;
616
617 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
618 fzeroes;
619 __m128 sine, cosine, tangent, condition1, condition2, condition3;
620 __m128i q, r, ones, twos, fours;
621
622 2 m4pi = _mm_set1_ps(1.273239545);
623 2 pio4A = _mm_set1_ps(0.78515625);
624 2 pio4B = _mm_set1_ps(0.241876e-3);
625 2 ffours = _mm_set1_ps(4.0);
626 2 ftwos = _mm_set1_ps(2.0);
627 2 fones = _mm_set1_ps(1.0);
628 2 fzeroes = _mm_setzero_ps();
629 2 ones = _mm_set1_epi32(1);
630 2 twos = _mm_set1_epi32(2);
631 2 fours = _mm_set1_epi32(4);
632
633 2 cp1 = _mm_set1_ps(1.0);
634 2 cp2 = _mm_set1_ps(0.83333333e-1);
635 2 cp3 = _mm_set1_ps(0.2777778e-2);
636 2 cp4 = _mm_set1_ps(0.49603e-4);
637 2 cp5 = _mm_set1_ps(0.551e-6);
638
639
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
640 65534 aVal = _mm_loadu_ps(aPtr);
641 262136 s = _mm_sub_ps(aVal,
642 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
643 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
644 131068 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
645
646 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
647 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
648
649 131068 s = _mm_div_ps(
650 s, _mm_set1_ps(8.0)); // The constant is 2^N for the N = 3 angle-doubling steps below
651 65534 s = _mm_mul_ps(s, s);
652 // Evaluate Taylor series
653 524272 s = _mm_mul_ps(
654 _mm_add_ps(
655 _mm_mul_ps(
656 _mm_sub_ps(
657 _mm_mul_ps(
658 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
659 cp3),
660 s),
661 cp2),
662 s),
663 cp1),
664 s);
665
666
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++) {
667 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
668 }
669 65534 s = _mm_div_ps(s, ftwos);
670
671 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
672 65534 cosine = _mm_sub_ps(fones, s);
673
674 262136 condition1 = _mm_cmpneq_ps(
675 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
676 327670 condition2 = _mm_cmpneq_ps(
677 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
678 _mm_cmplt_ps(aVal, fzeroes));
679 196602 condition3 = _mm_cmpneq_ps(
680 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
681
682 65534 __m128 temp = cosine;
683 196602 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
684 196602 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
685 sine =
686 262136 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
687 262136 cosine = _mm_sub_ps(
688 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
689 65534 tangent = _mm_div_ps(sine, cosine);
690 _mm_storeu_ps(bPtr, tangent);
691 65534 aPtr += 4;
692 65534 bPtr += 4;
693 }
694
695 2 number = quarterPoints * 4;
696
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
697 6 *bPtr++ = tanf(*aPtr++);
698 }
699 2 }
700
701 #endif /* LV_HAVE_SSE4_1 for unaligned */
702
703
704 #ifdef LV_HAVE_GENERIC
705
706 static inline void
707 2 volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
708 {
709 2 float* bPtr = bVector;
710 2 const float* aPtr = aVector;
711 2 unsigned int number = 0;
712
713
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (; number < num_points; number++) {
714 262142 *bPtr++ = tanf(*aPtr++);
715 }
716 2 }
717 #endif /* LV_HAVE_GENERIC */
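The generic kernel is the portable fallback and the natural reference for spot-checking the SIMD paths. A short harness in the spirit of the example at the top of the file; the 1024-point grid and the [-1.5, 1.5] range are arbitrary choices that stay away from the poles at +/-pi/2, and the program simply reports the largest deviation from libm's tanf.

    #include <math.h>
    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int N = 1024;
        unsigned int alignment = volk_get_alignment();
        float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
        float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

        for (unsigned int i = 0; i < N; i++)
            in[i] = -1.5f + 3.0f * (float)i / (float)(N - 1);

        volk_32f_tan_32f(out, in, N);   /* dispatcher picks the best kernel */

        float max_err = 0.0f;
        for (unsigned int i = 0; i < N; i++) {
            float err = fabsf(out[i] - tanf(in[i]));
            if (err > max_err)
                max_err = err;
        }
        printf("max abs deviation from tanf over [-1.5, 1.5]: %g\n", max_err);

        volk_free(in);
        volk_free(out);
        return 0;
    }

Build it against libvolk and libm.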
718
719
720 #ifdef LV_HAVE_NEON
721 #include <arm_neon.h>
722 #include <volk/volk_neon_intrinsics.h>
723
724 static inline void
725 volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
726 {
727 unsigned int number = 0;
728 unsigned int quarter_points = num_points / 4;
729 float* bVectorPtr = bVector;
730 const float* aVectorPtr = aVector;
731
732 float32x4_t b_vec;
733 float32x4_t a_vec;
734
735 for (number = 0; number < quarter_points; number++) {
736 a_vec = vld1q_f32(aVectorPtr);
737 // Prefetch the next block of inputs; this speeds things up
738 __VOLK_PREFETCH(aVectorPtr + 4);
739 b_vec = _vtanq_f32(a_vec);
740 vst1q_f32(bVectorPtr, b_vec);
741 // move pointers ahead
742 bVectorPtr += 4;
743 aVectorPtr += 4;
744 }
745
746 // Deal with the rest
747 for (number = quarter_points * 4; number < num_points; number++) {
748 *bVectorPtr++ = tanf(*aVectorPtr++);
749 }
750 }
751 #endif /* LV_HAVE_NEON */
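On ARM the heavy lifting is delegated to _vtanq_f32 from volk/volk_neon_intrinsics.h, so the doubling trick used by the x86 kernels does not appear literally in this file. For reference, the same s <- s*(4 - s) step would look like this in NEON; the helper is hypothetical and not part of the header.

    #include <arm_neon.h>

    /* One half-angle doubling step, NEON rendering of the x86 kernels' inner loop. */
    static inline float32x4_t half_angle_double(float32x4_t s)
    {
        const float32x4_t fours = vdupq_n_f32(4.0f);
        return vmulq_f32(s, vsubq_f32(fours, s));
    }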
752
753
754 #endif /* INCLUDED_volk_32f_tan_32f_u_H */
755