GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_sin_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 287 387 74.2%
Functions: 7 9 77.8%
Branches: 38 50 76.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_sin_32f
12 *
13 * \b Overview
14 *
15 * Computes the sine of the input vector and stores the results in the output vector.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_sin_32f(float* bVector, const float* aVector, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li aVector: The input vector of floats.
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li bVector: The output vector.
28 *
29 * \b Example
30 * Calculate sin(theta) for several common angles.
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * in[0] = 0.000;
38 * in[1] = 0.524;
39 * in[2] = 0.786;
40 * in[3] = 1.047;
41 * in[4] = 1.571;
42 * in[5] = 1.571;
43 * in[6] = 2.094;
44 * in[7] = 2.356;
45 * in[8] = 2.618;
46 * in[9] = 3.142;
47 *
48 * volk_32f_sin_32f(out, in, N);
49 *
50 * for(unsigned int ii = 0; ii < N; ++ii){
51 * printf("sin(%1.3f) = %1.3f\n", in[ii], out[ii]);
52 * }
53 *
54 * volk_free(in);
55 * volk_free(out);
56 * \endcode
57 */
58
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62
63 #ifndef INCLUDED_volk_32f_sin_32f_a_H
64 #define INCLUDED_volk_32f_sin_32f_a_H
65 #ifdef LV_HAVE_AVX512F
66
67 #include <immintrin.h>
68 static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
69 const float* inVector,
70 unsigned int num_points)
71 {
72 float* sinPtr = sinVector;
73 const float* inPtr = inVector;
74
75 unsigned int number = 0;
76 unsigned int sixteenPoints = num_points / 16;
77 unsigned int i = 0;
78
79 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
80 fones;
81 __m512 sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2, ltZero;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110 // r = q + q&1, q indicates quadrant, r gives
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
133
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
136
137 condition1 = _mm512_cmpneq_epi32_mask(
138 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
139 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
140 condition2 = _mm512_kxor(
141 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
142
143 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
144 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
145 _mm512_store_ps(sinPtr, sine);
146 inPtr += 16;
147 sinPtr += 16;
148 }
149
150 number = sixteenPoints * 16;
151 for (; number < num_points; number++) {
152 *sinPtr++ = sinf(*inPtr++);
153 }
154 }
155 #endif
156 #if LV_HAVE_AVX2 && LV_HAVE_FMA
157 #include <immintrin.h>
158
159 static inline void
160 2 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
161 {
162 2 float* bPtr = bVector;
163 2 const float* aPtr = aVector;
164
165 2 unsigned int number = 0;
166 2 unsigned int eighthPoints = num_points / 8;
167 2 unsigned int i = 0;
168
169 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
170 fzeroes;
171 __m256 sine, cosine, condition1, condition2;
172 __m256i q, r, ones, twos, fours;
173
174 2 m4pi = _mm256_set1_ps(1.273239545);
175 2 pio4A = _mm256_set1_ps(0.78515625);
176 2 pio4B = _mm256_set1_ps(0.241876e-3);
177 2 ffours = _mm256_set1_ps(4.0);
178 2 ftwos = _mm256_set1_ps(2.0);
179 2 fones = _mm256_set1_ps(1.0);
180 2 fzeroes = _mm256_setzero_ps();
181 2 ones = _mm256_set1_epi32(1);
182 2 twos = _mm256_set1_epi32(2);
183 2 fours = _mm256_set1_epi32(4);
184
185 2 cp1 = _mm256_set1_ps(1.0);
186 2 cp2 = _mm256_set1_ps(0.83333333e-1);
187 2 cp3 = _mm256_set1_ps(0.2777778e-2);
188 2 cp4 = _mm256_set1_ps(0.49603e-4);
189 2 cp5 = _mm256_set1_ps(0.551e-6);
190
191
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
192 32766 aVal = _mm256_load_ps(aPtr);
193 98298 s = _mm256_sub_ps(aVal,
194 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
195 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
196 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
197 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
198
199 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
200 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
201
202 65532 s = _mm256_div_ps(
203 s,
204 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
205 32766 s = _mm256_mul_ps(s, s);
206 // Evaluate Taylor series
207 131064 s = _mm256_mul_ps(
208 _mm256_fmadd_ps(
209 _mm256_fmsub_ps(
210 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
211 s,
212 cp1),
213 s);
214
215
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
216 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
217 }
218 32766 s = _mm256_div_ps(s, ftwos);
219
220 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
221 32766 cosine = _mm256_sub_ps(fones, s);
222
223 65532 condition1 = _mm256_cmp_ps(
224 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
225 fzeroes,
226 _CMP_NEQ_UQ);
227 98298 condition2 = _mm256_cmp_ps(
228 _mm256_cmp_ps(
229 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
230 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
231 _CMP_NEQ_UQ);
232 // Need this condition only for cos
233 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
234 // twos), fours)), fzeroes);
235
236 sine =
237 98298 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
238 131064 sine = _mm256_sub_ps(
239 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
240 _mm256_store_ps(bPtr, sine);
241 32766 aPtr += 8;
242 32766 bPtr += 8;
243 }
244
245 2 number = eighthPoints * 8;
246
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
247 14 *bPtr++ = sin(*aPtr++);
248 }
249 2 }
250
251 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
252
253 #ifdef LV_HAVE_AVX2
254 #include <immintrin.h>
255
256 static inline void
257 2 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
258 {
259 2 float* bPtr = bVector;
260 2 const float* aPtr = aVector;
261
262 2 unsigned int number = 0;
263 2 unsigned int eighthPoints = num_points / 8;
264 2 unsigned int i = 0;
265
266 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
267 fzeroes;
268 __m256 sine, cosine, condition1, condition2;
269 __m256i q, r, ones, twos, fours;
270
271 2 m4pi = _mm256_set1_ps(1.273239545);
272 2 pio4A = _mm256_set1_ps(0.78515625);
273 2 pio4B = _mm256_set1_ps(0.241876e-3);
274 2 ffours = _mm256_set1_ps(4.0);
275 2 ftwos = _mm256_set1_ps(2.0);
276 2 fones = _mm256_set1_ps(1.0);
277 2 fzeroes = _mm256_setzero_ps();
278 2 ones = _mm256_set1_epi32(1);
279 2 twos = _mm256_set1_epi32(2);
280 2 fours = _mm256_set1_epi32(4);
281
282 2 cp1 = _mm256_set1_ps(1.0);
283 2 cp2 = _mm256_set1_ps(0.83333333e-1);
284 2 cp3 = _mm256_set1_ps(0.2777778e-2);
285 2 cp4 = _mm256_set1_ps(0.49603e-4);
286 2 cp5 = _mm256_set1_ps(0.551e-6);
287
288
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
289 32766 aVal = _mm256_load_ps(aPtr);
290 98298 s = _mm256_sub_ps(aVal,
291 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
292 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
293 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
294 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
295
296 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
297 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
298
299 65532 s = _mm256_div_ps(
300 s,
301 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
302 32766 s = _mm256_mul_ps(s, s);
303 // Evaluate Taylor series
304 262128 s = _mm256_mul_ps(
305 _mm256_add_ps(
306 _mm256_mul_ps(
307 _mm256_sub_ps(
308 _mm256_mul_ps(
309 _mm256_add_ps(
310 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
311 s),
312 cp3),
313 s),
314 cp2),
315 s),
316 cp1),
317 s);
318
319
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
320 294894 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
321 }
322 32766 s = _mm256_div_ps(s, ftwos);
323
324 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
325 32766 cosine = _mm256_sub_ps(fones, s);
326
327 65532 condition1 = _mm256_cmp_ps(
328 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
329 fzeroes,
330 _CMP_NEQ_UQ);
331 98298 condition2 = _mm256_cmp_ps(
332 _mm256_cmp_ps(
333 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
334 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
335 _CMP_NEQ_UQ);
336 // Need this condition only for cos
337 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
338 // twos), fours)), fzeroes);
339
340 sine =
341 98298 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
342 131064 sine = _mm256_sub_ps(
343 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
344 _mm256_store_ps(bPtr, sine);
345 32766 aPtr += 8;
346 32766 bPtr += 8;
347 }
348
349 2 number = eighthPoints * 8;
350
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
351 14 *bPtr++ = sin(*aPtr++);
352 }
353 2 }
354
355 #endif /* LV_HAVE_AVX2 for aligned */
356
357 #ifdef LV_HAVE_SSE4_1
358 #include <smmintrin.h>
359
360 static inline void
361 2 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
362 {
363 2 float* bPtr = bVector;
364 2 const float* aPtr = aVector;
365
366 2 unsigned int number = 0;
367 2 unsigned int quarterPoints = num_points / 4;
368 2 unsigned int i = 0;
369
370 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
371 fzeroes;
372 __m128 sine, cosine, condition1, condition2;
373 __m128i q, r, ones, twos, fours;
374
375 2 m4pi = _mm_set1_ps(1.273239545);
376 2 pio4A = _mm_set1_ps(0.78515625);
377 2 pio4B = _mm_set1_ps(0.241876e-3);
378 2 ffours = _mm_set1_ps(4.0);
379 2 ftwos = _mm_set1_ps(2.0);
380 2 fones = _mm_set1_ps(1.0);
381 2 fzeroes = _mm_setzero_ps();
382 2 ones = _mm_set1_epi32(1);
383 2 twos = _mm_set1_epi32(2);
384 2 fours = _mm_set1_epi32(4);
385
386 2 cp1 = _mm_set1_ps(1.0);
387 2 cp2 = _mm_set1_ps(0.83333333e-1);
388 2 cp3 = _mm_set1_ps(0.2777778e-2);
389 2 cp4 = _mm_set1_ps(0.49603e-4);
390 2 cp5 = _mm_set1_ps(0.551e-6);
391
392
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
393 65534 aVal = _mm_load_ps(aPtr);
394 262136 s = _mm_sub_ps(aVal,
395 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
396 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
397 131068 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
398
399 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
400 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
401
402 131068 s = _mm_div_ps(
403 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
404 65534 s = _mm_mul_ps(s, s);
405 // Evaluate Taylor series
406 524272 s = _mm_mul_ps(
407 _mm_add_ps(
408 _mm_mul_ps(
409 _mm_sub_ps(
410 _mm_mul_ps(
411 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
412 cp3),
413 s),
414 cp2),
415 s),
416 cp1),
417 s);
418
419
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++) {
420 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
421 }
422 65534 s = _mm_div_ps(s, ftwos);
423
424 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
425 65534 cosine = _mm_sub_ps(fones, s);
426
427 262136 condition1 = _mm_cmpneq_ps(
428 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
429 327670 condition2 = _mm_cmpneq_ps(
430 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
431 _mm_cmplt_ps(aVal, fzeroes));
432 // Need this condition only for cos
433 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
434 // twos), fours)), fzeroes);
435
436 196602 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
437 sine =
438 262136 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
439 _mm_store_ps(bPtr, sine);
440 65534 aPtr += 4;
441 65534 bPtr += 4;
442 }
443
444 2 number = quarterPoints * 4;
445
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
446 6 *bPtr++ = sinf(*aPtr++);
447 }
448 2 }
449
450 #endif /* LV_HAVE_SSE4_1 for aligned */
451
452
453 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
454
455 #ifndef INCLUDED_volk_32f_sin_32f_u_H
456 #define INCLUDED_volk_32f_sin_32f_u_H
457
458 #ifdef LV_HAVE_AVX512F
459
460 #include <immintrin.h>
461 static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
462 const float* inVector,
463 unsigned int num_points)
464 {
465 float* sinPtr = sinVector;
466 const float* inPtr = inVector;
467
468 unsigned int number = 0;
469 unsigned int sixteenPoints = num_points / 16;
470 unsigned int i = 0;
471
472 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
473 fones;
474 __m512 sine, cosine;
475 __m512i q, zeros, ones, twos, fours;
476
477 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
478 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
479 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
480 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
481 ffours = _mm512_set1_ps(4.0);
482 ftwos = _mm512_set1_ps(2.0);
483 fones = _mm512_set1_ps(1.0);
484 zeros = _mm512_setzero_epi32();
485 ones = _mm512_set1_epi32(1);
486 twos = _mm512_set1_epi32(2);
487 fours = _mm512_set1_epi32(4);
488
489 cp1 = _mm512_set1_ps(1.0);
490 cp2 = _mm512_set1_ps(0.08333333333333333);
491 cp3 = _mm512_set1_ps(0.002777777777777778);
492 cp4 = _mm512_set1_ps(4.96031746031746e-05);
493 cp5 = _mm512_set1_ps(5.511463844797178e-07);
494 __mmask16 condition1, condition2, ltZero;
495
496 for (; number < sixteenPoints; number++) {
497 aVal = _mm512_loadu_ps(inPtr);
498 // s = fabs(aVal)
499 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
500
501 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
502 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
503 // r = q + q&1, q indicates quadrant, r gives
504 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
505
506 s = _mm512_fnmadd_ps(r, pio4A, s);
507 s = _mm512_fnmadd_ps(r, pio4B, s);
508 s = _mm512_fnmadd_ps(r, pio4C, s);
509
510 s = _mm512_div_ps(
511 s,
512 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
513 s = _mm512_mul_ps(s, s);
514 // Evaluate Taylor series
515 s = _mm512_mul_ps(
516 _mm512_fmadd_ps(
517 _mm512_fmsub_ps(
518 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
519 s,
520 cp1),
521 s);
522
523 for (i = 0; i < 3; i++)
524 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
525 s = _mm512_div_ps(s, ftwos);
526
527 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
528 cosine = _mm512_sub_ps(fones, s);
529
530 condition1 = _mm512_cmpneq_epi32_mask(
531 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
532 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
533 condition2 = _mm512_kxor(
534 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
535
536 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
537 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
538 _mm512_storeu_ps(sinPtr, sine);
539 inPtr += 16;
540 sinPtr += 16;
541 }
542
543 number = sixteenPoints * 16;
544 for (; number < num_points; number++) {
545 *sinPtr++ = sinf(*inPtr++);
546 }
547 }
548 #endif
549
550 #if LV_HAVE_AVX2 && LV_HAVE_FMA
551 #include <immintrin.h>
552
553 static inline void
554 2 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
555 {
556 2 float* bPtr = bVector;
557 2 const float* aPtr = aVector;
558
559 2 unsigned int number = 0;
560 2 unsigned int eighthPoints = num_points / 8;
561 2 unsigned int i = 0;
562
563 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
564 fzeroes;
565 __m256 sine, cosine, condition1, condition2;
566 __m256i q, r, ones, twos, fours;
567
568 2 m4pi = _mm256_set1_ps(1.273239545);
569 2 pio4A = _mm256_set1_ps(0.78515625);
570 2 pio4B = _mm256_set1_ps(0.241876e-3);
571 2 ffours = _mm256_set1_ps(4.0);
572 2 ftwos = _mm256_set1_ps(2.0);
573 2 fones = _mm256_set1_ps(1.0);
574 2 fzeroes = _mm256_setzero_ps();
575 2 ones = _mm256_set1_epi32(1);
576 2 twos = _mm256_set1_epi32(2);
577 2 fours = _mm256_set1_epi32(4);
578
579 2 cp1 = _mm256_set1_ps(1.0);
580 2 cp2 = _mm256_set1_ps(0.83333333e-1);
581 2 cp3 = _mm256_set1_ps(0.2777778e-2);
582 2 cp4 = _mm256_set1_ps(0.49603e-4);
583 2 cp5 = _mm256_set1_ps(0.551e-6);
584
585
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
586 32766 aVal = _mm256_loadu_ps(aPtr);
587 98298 s = _mm256_sub_ps(aVal,
588 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
589 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
590 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
591 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
592
593 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
594 65532 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
595
596 65532 s = _mm256_div_ps(
597 s,
598 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
599 32766 s = _mm256_mul_ps(s, s);
600 // Evaluate Taylor series
601 131064 s = _mm256_mul_ps(
602 _mm256_fmadd_ps(
603 _mm256_fmsub_ps(
604 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
605 s,
606 cp1),
607 s);
608
609
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
610 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
611 }
612 32766 s = _mm256_div_ps(s, ftwos);
613
614 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
615 32766 cosine = _mm256_sub_ps(fones, s);
616
617 65532 condition1 = _mm256_cmp_ps(
618 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
619 fzeroes,
620 _CMP_NEQ_UQ);
621 98298 condition2 = _mm256_cmp_ps(
622 _mm256_cmp_ps(
623 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
624 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
625 _CMP_NEQ_UQ);
626 // Need this condition only for cos
627 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
628 // twos), fours)), fzeroes);
629
630 sine =
631 98298 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
632 131064 sine = _mm256_sub_ps(
633 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
634 _mm256_storeu_ps(bPtr, sine);
635 32766 aPtr += 8;
636 32766 bPtr += 8;
637 }
638
639 2 number = eighthPoints * 8;
640
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
641 14 *bPtr++ = sin(*aPtr++);
642 }
643 2 }
644
645 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
646
647 #ifdef LV_HAVE_AVX2
648 #include <immintrin.h>
649
650 static inline void
651 2 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
652 {
653 2 float* bPtr = bVector;
654 2 const float* aPtr = aVector;
655
656 2 unsigned int number = 0;
657 2 unsigned int eighthPoints = num_points / 8;
658 2 unsigned int i = 0;
659
660 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
661 fzeroes;
662 __m256 sine, cosine, condition1, condition2;
663 __m256i q, r, ones, twos, fours;
664
665 2 m4pi = _mm256_set1_ps(1.273239545);
666 2 pio4A = _mm256_set1_ps(0.78515625);
667 2 pio4B = _mm256_set1_ps(0.241876e-3);
668 2 ffours = _mm256_set1_ps(4.0);
669 2 ftwos = _mm256_set1_ps(2.0);
670 2 fones = _mm256_set1_ps(1.0);
671 2 fzeroes = _mm256_setzero_ps();
672 2 ones = _mm256_set1_epi32(1);
673 2 twos = _mm256_set1_epi32(2);
674 2 fours = _mm256_set1_epi32(4);
675
676 2 cp1 = _mm256_set1_ps(1.0);
677 2 cp2 = _mm256_set1_ps(0.83333333e-1);
678 2 cp3 = _mm256_set1_ps(0.2777778e-2);
679 2 cp4 = _mm256_set1_ps(0.49603e-4);
680 2 cp5 = _mm256_set1_ps(0.551e-6);
681
682
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
683 32766 aVal = _mm256_loadu_ps(aPtr);
684 98298 s = _mm256_sub_ps(aVal,
685 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
686 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
687 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
688 65532 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
689
690 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
691 98298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
692
693 65532 s = _mm256_div_ps(
694 s,
695 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
696 32766 s = _mm256_mul_ps(s, s);
697 // Evaluate Taylor series
698 262128 s = _mm256_mul_ps(
699 _mm256_add_ps(
700 _mm256_mul_ps(
701 _mm256_sub_ps(
702 _mm256_mul_ps(
703 _mm256_add_ps(
704 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
705 s),
706 cp3),
707 s),
708 cp2),
709 s),
710 cp1),
711 s);
712
713
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++) {
714 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
715 }
716 32766 s = _mm256_div_ps(s, ftwos);
717
718 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
719 32766 cosine = _mm256_sub_ps(fones, s);
720
721 65532 condition1 = _mm256_cmp_ps(
722 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
723 fzeroes,
724 _CMP_NEQ_UQ);
725 98298 condition2 = _mm256_cmp_ps(
726 _mm256_cmp_ps(
727 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
728 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
729 _CMP_NEQ_UQ);
730 // Need this condition only for cos
731 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
732 // twos), fours)), fzeroes);
733
734 sine =
735 98298 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
736 131064 sine = _mm256_sub_ps(
737 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
738 _mm256_storeu_ps(bPtr, sine);
739 32766 aPtr += 8;
740 32766 bPtr += 8;
741 }
742
743 2 number = eighthPoints * 8;
744
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
745 14 *bPtr++ = sin(*aPtr++);
746 }
747 2 }
748
749 #endif /* LV_HAVE_AVX2 for unaligned */
750
751
752 #ifdef LV_HAVE_SSE4_1
753 #include <smmintrin.h>
754
755 static inline void
756 2 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
757 {
758 2 float* bPtr = bVector;
759 2 const float* aPtr = aVector;
760
761 2 unsigned int number = 0;
762 2 unsigned int quarterPoints = num_points / 4;
763 2 unsigned int i = 0;
764
765 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
766 fzeroes;
767 __m128 sine, cosine, condition1, condition2;
768 __m128i q, r, ones, twos, fours;
769
770 2 m4pi = _mm_set1_ps(1.273239545);
771 2 pio4A = _mm_set1_ps(0.78515625);
772 2 pio4B = _mm_set1_ps(0.241876e-3);
773 2 ffours = _mm_set1_ps(4.0);
774 2 ftwos = _mm_set1_ps(2.0);
775 2 fones = _mm_set1_ps(1.0);
776 2 fzeroes = _mm_setzero_ps();
777 2 ones = _mm_set1_epi32(1);
778 2 twos = _mm_set1_epi32(2);
779 2 fours = _mm_set1_epi32(4);
780
781 2 cp1 = _mm_set1_ps(1.0);
782 2 cp2 = _mm_set1_ps(0.83333333e-1);
783 2 cp3 = _mm_set1_ps(0.2777778e-2);
784 2 cp4 = _mm_set1_ps(0.49603e-4);
785 2 cp5 = _mm_set1_ps(0.551e-6);
786
787
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
788 65534 aVal = _mm_loadu_ps(aPtr);
789 262136 s = _mm_sub_ps(aVal,
790 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
791 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
792 131068 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
793
794 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
795 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
796
797 131068 s = _mm_div_ps(
798 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
799 65534 s = _mm_mul_ps(s, s);
800 // Evaluate Taylor series
801 524272 s = _mm_mul_ps(
802 _mm_add_ps(
803 _mm_mul_ps(
804 _mm_sub_ps(
805 _mm_mul_ps(
806 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
807 cp3),
808 s),
809 cp2),
810 s),
811 cp1),
812 s);
813
814
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++) {
815 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
816 }
817 65534 s = _mm_div_ps(s, ftwos);
818
819 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
820 65534 cosine = _mm_sub_ps(fones, s);
821
822 262136 condition1 = _mm_cmpneq_ps(
823 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
824 327670 condition2 = _mm_cmpneq_ps(
825 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
826 _mm_cmplt_ps(aVal, fzeroes));
827
828 196602 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
829 sine =
830 262136 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
831 _mm_storeu_ps(bPtr, sine);
832 65534 aPtr += 4;
833 65534 bPtr += 4;
834 }
835
836 2 number = quarterPoints * 4;
837
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
838 6 *bPtr++ = sinf(*aPtr++);
839 }
840 2 }
841
842 #endif /* LV_HAVE_SSE4_1 for unaligned */
843
844
845 #ifdef LV_HAVE_GENERIC
846
847 static inline void
848 2 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
849 {
850 2 float* bPtr = bVector;
851 2 const float* aPtr = aVector;
852 2 unsigned int number = 0;
853
854
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
855 262142 *bPtr++ = sinf(*aPtr++);
856 }
857 2 }
858
859 #endif /* LV_HAVE_GENERIC */
860
861
862 #ifdef LV_HAVE_NEON
863 #include <arm_neon.h>
864 #include <volk/volk_neon_intrinsics.h>
865
866 static inline void
867 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
868 {
869 unsigned int number = 0;
870 unsigned int quarter_points = num_points / 4;
871 float* bVectorPtr = bVector;
872 const float* aVectorPtr = aVector;
873
874 float32x4_t b_vec;
875 float32x4_t a_vec;
876
877 for (number = 0; number < quarter_points; number++) {
878 a_vec = vld1q_f32(aVectorPtr);
879 // Prefetch next one, speeds things up
880 __VOLK_PREFETCH(aVectorPtr + 4);
881 b_vec = _vsinq_f32(a_vec);
882 vst1q_f32(bVectorPtr, b_vec);
883 // move pointers ahead
884 bVectorPtr += 4;
885 aVectorPtr += 4;
886 }
887
888 // Deal with the rest
889 for (number = quarter_points * 4; number < num_points; number++) {
890 *bVectorPtr++ = sinf(*aVectorPtr++);
891 }
892 }
893
894 #endif /* LV_HAVE_NEON */
895
896
897 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
898