GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_cos_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 356 453 78.6%
Functions: 8 10 80.0%
Branches: 45 58 77.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_cos_32f
12 *
13 * \b Overview
14 *
15 * Computes cosine of the input vector and stores results in the output vector.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_cos_32f(float* bVector, const float* aVector, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li aVector: The input vector of floats.
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li bVector: The vector where results will be stored.
28 *
29 * \b Example
30 * Calculate cos(theta) for common angles.
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * in[0] = 0.000;
38 * in[1] = 0.524;
39 * in[2] = 0.786;
40 * in[3] = 1.047;
41 * in[4] = 1.571;
42 * in[5] = 1.571;
43 * in[6] = 2.094;
44 * in[7] = 2.356;
45 * in[8] = 2.618;
46 * in[9] = 3.142;
47 *
48 * volk_32f_cos_32f(out, in, N);
49 *
50 * for(unsigned int ii = 0; ii < N; ++ii){
51 * printf("cos(%1.3f) = %1.3f\n", in[ii], out[ii]);
52 * }
53 *
54 * volk_free(in);
55 * volk_free(out);
56 * \endcode
57 */
58
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62
63 #ifndef INCLUDED_volk_32f_cos_32f_a_H
64 #define INCLUDED_volk_32f_cos_32f_a_H
65
66 #ifdef LV_HAVE_AVX512F
67
68 #include <immintrin.h>
69 static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
70 const float* inVector,
71 unsigned int num_points)
72 {
73 float* cosPtr = cosVector;
74 const float* inPtr = inVector;
75
76 unsigned int number = 0;
77 unsigned int sixteenPoints = num_points / 16;
78 unsigned int i = 0;
79
80 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
81 fones, sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110 // r = q + q&1, q indicates quadrant, r gives
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
133
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
136
137 // if(((q+1)&2) != 0) { cosine=sine;}
138 condition1 = _mm512_cmpneq_epi32_mask(
139 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
140
141 // if(((q+2)&4) != 0) { cosine = -cosine;}
142 condition2 = _mm512_cmpneq_epi32_mask(
143 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
144 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
145 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
146 _mm512_store_ps(cosPtr, cosine);
147 inPtr += 16;
148 cosPtr += 16;
149 }
150
151 number = sixteenPoints * 16;
152 for (; number < num_points; number++) {
153 *cosPtr++ = cosf(*inPtr++);
154 }
155 }
156 #endif
157
158 #if LV_HAVE_AVX2 && LV_HAVE_FMA
159 #include <immintrin.h>
160
161 static inline void
162 2 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
163 {
164 2 float* bPtr = bVector;
165 2 const float* aPtr = aVector;
166
167 2 unsigned int number = 0;
168 2 unsigned int eighthPoints = num_points / 8;
169 2 unsigned int i = 0;
170
171 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
172 fones, fzeroes;
173 __m256 sine, cosine;
174 __m256i q, ones, twos, fours;
175
176 2 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
177 2 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
178 2 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
179 2 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
180 2 ffours = _mm256_set1_ps(4.0);
181 2 ftwos = _mm256_set1_ps(2.0);
182 2 fones = _mm256_set1_ps(1.0);
183 2 fzeroes = _mm256_setzero_ps();
184 2 __m256i zeroes = _mm256_set1_epi32(0);
185 2 ones = _mm256_set1_epi32(1);
186 2 __m256i allones = _mm256_set1_epi32(0xffffffff);
187 2 twos = _mm256_set1_epi32(2);
188 2 fours = _mm256_set1_epi32(4);
189
190 2 cp1 = _mm256_set1_ps(1.0);
191 2 cp2 = _mm256_set1_ps(0.08333333333333333);
192 2 cp3 = _mm256_set1_ps(0.002777777777777778);
193 2 cp4 = _mm256_set1_ps(4.96031746031746e-05);
194 2 cp5 = _mm256_set1_ps(5.511463844797178e-07);
195 union bit256 condition1;
196 union bit256 condition3;
197
198
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
199
200 32766 aVal = _mm256_load_ps(aPtr);
201 // s = fabs(aVal)
202 98298 s = _mm256_sub_ps(aVal,
203 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
204 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
205 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
206 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
207 // r = q + q&1, q indicates quadrant, r gives
208 98298 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
209
210 32766 s = _mm256_fnmadd_ps(r, pio4A, s);
211 32766 s = _mm256_fnmadd_ps(r, pio4B, s);
212 32766 s = _mm256_fnmadd_ps(r, pio4C, s);
213
214 65532 s = _mm256_div_ps(
215 s,
216 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
217 32766 s = _mm256_mul_ps(s, s);
218 // Evaluate Taylor series
219 131064 s = _mm256_mul_ps(
220 _mm256_fmadd_ps(
221 _mm256_fmsub_ps(
222 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
223 s,
224 cp1),
225 s);
226
227
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++)
228 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
229 32766 s = _mm256_div_ps(s, ftwos);
230
231 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
232 32766 cosine = _mm256_sub_ps(fones, s);
233
234 // if(((q+1)&2) != 0) { cosine=sine;}
235 32766 condition1.int_vec =
236 65532 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
237 65532 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
238
239 // if(((q+2)&4) != 0) { cosine = -cosine;}
240 65532 condition3.int_vec = _mm256_cmpeq_epi32(
241 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
242 32766 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
243
244 98298 cosine = _mm256_add_ps(
245 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
246 163830 cosine = _mm256_sub_ps(cosine,
247 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
248 condition3.float_vec));
249 _mm256_store_ps(bPtr, cosine);
250 32766 aPtr += 8;
251 32766 bPtr += 8;
252 }
253
254 2 number = eighthPoints * 8;
255
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
256 14 *bPtr++ = cos(*aPtr++);
257 }
258 2 }
259
260 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
261
262 #ifdef LV_HAVE_AVX2
263 #include <immintrin.h>
264
265 static inline void
266 2 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
267 {
268 2 float* bPtr = bVector;
269 2 const float* aPtr = aVector;
270
271 2 unsigned int number = 0;
272 2 unsigned int eighthPoints = num_points / 8;
273 2 unsigned int i = 0;
274
275 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
276 fones, fzeroes;
277 __m256 sine, cosine;
278 __m256i q, ones, twos, fours;
279
280 2 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
281 2 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
282 2 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
283 2 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
284 2 ffours = _mm256_set1_ps(4.0);
285 2 ftwos = _mm256_set1_ps(2.0);
286 2 fones = _mm256_set1_ps(1.0);
287 2 fzeroes = _mm256_setzero_ps();
288 2 __m256i zeroes = _mm256_set1_epi32(0);
289 2 ones = _mm256_set1_epi32(1);
290 2 __m256i allones = _mm256_set1_epi32(0xffffffff);
291 2 twos = _mm256_set1_epi32(2);
292 2 fours = _mm256_set1_epi32(4);
293
294 2 cp1 = _mm256_set1_ps(1.0);
295 2 cp2 = _mm256_set1_ps(0.08333333333333333);
296 2 cp3 = _mm256_set1_ps(0.002777777777777778);
297 2 cp4 = _mm256_set1_ps(4.96031746031746e-05);
298 2 cp5 = _mm256_set1_ps(5.511463844797178e-07);
299 union bit256 condition1;
300 union bit256 condition3;
301
302
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
303
304 32766 aVal = _mm256_load_ps(aPtr);
305 // s = fabs(aVal)
306 98298 s = _mm256_sub_ps(aVal,
307 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
308 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
309 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
310 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
311 // r = q + q&1, q indicates quadrant, r gives
312 98298 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
313
314 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
315 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
316 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
317
318 65532 s = _mm256_div_ps(
319 s,
320 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
321 32766 s = _mm256_mul_ps(s, s);
322 // Evaluate Taylor series
323 262128 s = _mm256_mul_ps(
324 _mm256_add_ps(
325 _mm256_mul_ps(
326 _mm256_sub_ps(
327 _mm256_mul_ps(
328 _mm256_add_ps(
329 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
330 s),
331 cp3),
332 s),
333 cp2),
334 s),
335 cp1),
336 s);
337
338
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++)
339 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
340 32766 s = _mm256_div_ps(s, ftwos);
341
342 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
343 32766 cosine = _mm256_sub_ps(fones, s);
344
345 // if(((q+1)&2) != 0) { cosine=sine;}
346 32766 condition1.int_vec =
347 65532 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
348 65532 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
349
350 // if(((q+2)&4) != 0) { cosine = -cosine;}
351 65532 condition3.int_vec = _mm256_cmpeq_epi32(
352 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
353 32766 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
354
355 98298 cosine = _mm256_add_ps(
356 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
357 163830 cosine = _mm256_sub_ps(cosine,
358 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
359 condition3.float_vec));
360 _mm256_store_ps(bPtr, cosine);
361 32766 aPtr += 8;
362 32766 bPtr += 8;
363 }
364
365 2 number = eighthPoints * 8;
366
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
367 14 *bPtr++ = cos(*aPtr++);
368 }
369 2 }
370
371 #endif /* LV_HAVE_AVX2 for aligned */
372
373 #ifdef LV_HAVE_SSE4_1
374 #include <smmintrin.h>
375
376 static inline void
377 2 volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
378 {
379 2 float* bPtr = bVector;
380 2 const float* aPtr = aVector;
381
382 2 unsigned int number = 0;
383 2 unsigned int quarterPoints = num_points / 4;
384 2 unsigned int i = 0;
385
386 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
387 fones, fzeroes;
388 __m128 sine, cosine;
389 __m128i q, ones, twos, fours;
390
391 2 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
392 2 pio4A = _mm_set1_ps(0.7853981554508209228515625);
393 2 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
394 2 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
395 2 ffours = _mm_set1_ps(4.0);
396 2 ftwos = _mm_set1_ps(2.0);
397 2 fones = _mm_set1_ps(1.0);
398 2 fzeroes = _mm_setzero_ps();
399 2 __m128i zeroes = _mm_set1_epi32(0);
400 2 ones = _mm_set1_epi32(1);
401 2 __m128i allones = _mm_set1_epi32(0xffffffff);
402 2 twos = _mm_set1_epi32(2);
403 2 fours = _mm_set1_epi32(4);
404
405 2 cp1 = _mm_set1_ps(1.0);
406 2 cp2 = _mm_set1_ps(0.08333333333333333);
407 2 cp3 = _mm_set1_ps(0.002777777777777778);
408 2 cp4 = _mm_set1_ps(4.96031746031746e-05);
409 2 cp5 = _mm_set1_ps(5.511463844797178e-07);
410 union bit128 condition1;
411 union bit128 condition3;
412
413
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
414
415 65534 aVal = _mm_load_ps(aPtr);
416 // s = fabs(aVal)
417 262136 s = _mm_sub_ps(aVal,
418 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
419 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
420 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
421 // r = q + q&1, q indicates quadrant, r gives
422 196602 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
423
424 131068 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
425 131068 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
426 131068 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
427
428 131068 s = _mm_div_ps(
429 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
430 65534 s = _mm_mul_ps(s, s);
431 // Evaluate Taylor series
432 524272 s = _mm_mul_ps(
433 _mm_add_ps(
434 _mm_mul_ps(
435 _mm_sub_ps(
436 _mm_mul_ps(
437 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
438 cp3),
439 s),
440 cp2),
441 s),
442 cp1),
443 s);
444
445
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++)
446 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
447 65534 s = _mm_div_ps(s, ftwos);
448
449 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
450 65534 cosine = _mm_sub_ps(fones, s);
451
452 // if(((q+1)&2) != 0) { cosine=sine;}
453 65534 condition1.int_vec =
454 131068 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
455 131068 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
456
457 // if(((q+2)&4) != 0) { cosine = -cosine;}
458 65534 condition3.int_vec =
459 131068 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
460 65534 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
461
462 196602 cosine = _mm_add_ps(cosine,
463 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
464 327670 cosine = _mm_sub_ps(
465 cosine,
466 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
467 _mm_store_ps(bPtr, cosine);
468 65534 aPtr += 4;
469 65534 bPtr += 4;
470 }
471
472 2 number = quarterPoints * 4;
473
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
474 6 *bPtr++ = cosf(*aPtr++);
475 }
476 2 }
477
478 #endif /* LV_HAVE_SSE4_1 for aligned */
479
480 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
481
482
483 #ifndef INCLUDED_volk_32f_cos_32f_u_H
484 #define INCLUDED_volk_32f_cos_32f_u_H
485
486 #ifdef LV_HAVE_AVX512F
487
488 #include <immintrin.h>
489 static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
490 const float* inVector,
491 unsigned int num_points)
492 {
493 float* cosPtr = cosVector;
494 const float* inPtr = inVector;
495
496 unsigned int number = 0;
497 unsigned int sixteenPoints = num_points / 16;
498 unsigned int i = 0;
499
500 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
501 fones, sine, cosine;
502 __m512i q, zeros, ones, twos, fours;
503
504 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
505 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
506 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
507 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
508 ffours = _mm512_set1_ps(4.0);
509 ftwos = _mm512_set1_ps(2.0);
510 fones = _mm512_set1_ps(1.0);
511 zeros = _mm512_setzero_epi32();
512 ones = _mm512_set1_epi32(1);
513 twos = _mm512_set1_epi32(2);
514 fours = _mm512_set1_epi32(4);
515
516 cp1 = _mm512_set1_ps(1.0);
517 cp2 = _mm512_set1_ps(0.08333333333333333);
518 cp3 = _mm512_set1_ps(0.002777777777777778);
519 cp4 = _mm512_set1_ps(4.96031746031746e-05);
520 cp5 = _mm512_set1_ps(5.511463844797178e-07);
521 __mmask16 condition1, condition2;
522 for (; number < sixteenPoints; number++) {
523 aVal = _mm512_loadu_ps(inPtr);
524 // s = fabs(aVal)
525 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
526
527 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
528 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
529 // r = q + q&1, q indicates quadrant, r gives
530 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
531
532 s = _mm512_fnmadd_ps(r, pio4A, s);
533 s = _mm512_fnmadd_ps(r, pio4B, s);
534 s = _mm512_fnmadd_ps(r, pio4C, s);
535
536 s = _mm512_div_ps(
537 s,
538 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
539 s = _mm512_mul_ps(s, s);
540 // Evaluate Taylor series
541 s = _mm512_mul_ps(
542 _mm512_fmadd_ps(
543 _mm512_fmsub_ps(
544 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
545 s,
546 cp1),
547 s);
548
549 for (i = 0; i < 3; i++)
550 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
551 s = _mm512_div_ps(s, ftwos);
552
553 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
554 cosine = _mm512_sub_ps(fones, s);
555
556 // if(((q+1)&2) != 0) { cosine=sine;}
557 condition1 = _mm512_cmpneq_epi32_mask(
558 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
559
560 // if(((q+2)&4) != 0) { cosine = -cosine;}
561 condition2 = _mm512_cmpneq_epi32_mask(
562 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
563
564 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
565 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
566 _mm512_storeu_ps(cosPtr, cosine);
567 inPtr += 16;
568 cosPtr += 16;
569 }
570
571 number = sixteenPoints * 16;
572 for (; number < num_points; number++) {
573 *cosPtr++ = cosf(*inPtr++);
574 }
575 }
576 #endif
577
578 #if LV_HAVE_AVX2 && LV_HAVE_FMA
579 #include <immintrin.h>
580
581 static inline void
582 2 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
583 {
584 2 float* bPtr = bVector;
585 2 const float* aPtr = aVector;
586
587 2 unsigned int number = 0;
588 2 unsigned int eighthPoints = num_points / 8;
589 2 unsigned int i = 0;
590
591 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
592 fones, fzeroes;
593 __m256 sine, cosine;
594 __m256i q, ones, twos, fours;
595
596 2 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
597 2 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
598 2 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
599 2 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
600 2 ffours = _mm256_set1_ps(4.0);
601 2 ftwos = _mm256_set1_ps(2.0);
602 2 fones = _mm256_set1_ps(1.0);
603 2 fzeroes = _mm256_setzero_ps();
604 2 __m256i zeroes = _mm256_set1_epi32(0);
605 2 ones = _mm256_set1_epi32(1);
606 2 __m256i allones = _mm256_set1_epi32(0xffffffff);
607 2 twos = _mm256_set1_epi32(2);
608 2 fours = _mm256_set1_epi32(4);
609
610 2 cp1 = _mm256_set1_ps(1.0);
611 2 cp2 = _mm256_set1_ps(0.08333333333333333);
612 2 cp3 = _mm256_set1_ps(0.002777777777777778);
613 2 cp4 = _mm256_set1_ps(4.96031746031746e-05);
614 2 cp5 = _mm256_set1_ps(5.511463844797178e-07);
615 union bit256 condition1;
616 union bit256 condition3;
617
618
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
619
620 32766 aVal = _mm256_loadu_ps(aPtr);
621 // s = fabs(aVal)
622 98298 s = _mm256_sub_ps(aVal,
623 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
624 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
625 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
626 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
627 // r = q + q&1, q indicates quadrant, r gives
628 98298 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
629
630 32766 s = _mm256_fnmadd_ps(r, pio4A, s);
631 32766 s = _mm256_fnmadd_ps(r, pio4B, s);
632 32766 s = _mm256_fnmadd_ps(r, pio4C, s);
633
634 65532 s = _mm256_div_ps(
635 s,
636 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
637 32766 s = _mm256_mul_ps(s, s);
638 // Evaluate Taylor series
639 131064 s = _mm256_mul_ps(
640 _mm256_fmadd_ps(
641 _mm256_fmsub_ps(
642 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
643 s,
644 cp1),
645 s);
646
647
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++)
648 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
649 32766 s = _mm256_div_ps(s, ftwos);
650
651 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
652 32766 cosine = _mm256_sub_ps(fones, s);
653
654 // if(((q+1)&2) != 0) { cosine=sine;}
655 32766 condition1.int_vec =
656 65532 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
657 65532 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
658
659 // if(((q+2)&4) != 0) { cosine = -cosine;}
660 65532 condition3.int_vec = _mm256_cmpeq_epi32(
661 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
662 32766 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
663
664 98298 cosine = _mm256_add_ps(
665 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
666 163830 cosine = _mm256_sub_ps(cosine,
667 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
668 condition3.float_vec));
669 _mm256_storeu_ps(bPtr, cosine);
670 32766 aPtr += 8;
671 32766 bPtr += 8;
672 }
673
674 2 number = eighthPoints * 8;
675
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
676 14 *bPtr++ = cos(*aPtr++);
677 }
678 2 }
679
680 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
681
682 #ifdef LV_HAVE_AVX2
683 #include <immintrin.h>
684
685 static inline void
686 2 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
687 {
688 2 float* bPtr = bVector;
689 2 const float* aPtr = aVector;
690
691 2 unsigned int number = 0;
692 2 unsigned int eighthPoints = num_points / 8;
693 2 unsigned int i = 0;
694
695 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
696 fones, fzeroes;
697 __m256 sine, cosine;
698 __m256i q, ones, twos, fours;
699
700 2 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
701 2 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
702 2 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
703 2 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
704 2 ffours = _mm256_set1_ps(4.0);
705 2 ftwos = _mm256_set1_ps(2.0);
706 2 fones = _mm256_set1_ps(1.0);
707 2 fzeroes = _mm256_setzero_ps();
708 2 __m256i zeroes = _mm256_set1_epi32(0);
709 2 ones = _mm256_set1_epi32(1);
710 2 __m256i allones = _mm256_set1_epi32(0xffffffff);
711 2 twos = _mm256_set1_epi32(2);
712 2 fours = _mm256_set1_epi32(4);
713
714 2 cp1 = _mm256_set1_ps(1.0);
715 2 cp2 = _mm256_set1_ps(0.08333333333333333);
716 2 cp3 = _mm256_set1_ps(0.002777777777777778);
717 2 cp4 = _mm256_set1_ps(4.96031746031746e-05);
718 2 cp5 = _mm256_set1_ps(5.511463844797178e-07);
719 union bit256 condition1;
720 union bit256 condition3;
721
722
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
723
724 32766 aVal = _mm256_loadu_ps(aPtr);
725 // s = fabs(aVal)
726 98298 s = _mm256_sub_ps(aVal,
727 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
728 32766 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
729 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
730 65532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
731 // r = q + q&1, q indicates quadrant, r gives
732 98298 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
733
734 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
735 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
736 65532 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
737
738 65532 s = _mm256_div_ps(
739 s,
740 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
741 32766 s = _mm256_mul_ps(s, s);
742 // Evaluate Taylor series
743 262128 s = _mm256_mul_ps(
744 _mm256_add_ps(
745 _mm256_mul_ps(
746 _mm256_sub_ps(
747 _mm256_mul_ps(
748 _mm256_add_ps(
749 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
750 s),
751 cp3),
752 s),
753 cp2),
754 s),
755 cp1),
756 s);
757
758
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
131064 for (i = 0; i < 3; i++)
759 196596 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
760 32766 s = _mm256_div_ps(s, ftwos);
761
762 98298 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
763 32766 cosine = _mm256_sub_ps(fones, s);
764
765 // if(((q+1)&2) != 0) { cosine=sine;}
766 32766 condition1.int_vec =
767 65532 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
768 65532 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
769
770 // if(((q+2)&4) != 0) { cosine = -cosine;}
771 65532 condition3.int_vec = _mm256_cmpeq_epi32(
772 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
773 32766 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
774
775 98298 cosine = _mm256_add_ps(
776 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
777 163830 cosine = _mm256_sub_ps(cosine,
778 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
779 condition3.float_vec));
780 _mm256_storeu_ps(bPtr, cosine);
781 32766 aPtr += 8;
782 32766 bPtr += 8;
783 }
784
785 2 number = eighthPoints * 8;
786
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
787 14 *bPtr++ = cos(*aPtr++);
788 }
789 2 }
790
791 #endif /* LV_HAVE_AVX2 for unaligned */
792
793 #ifdef LV_HAVE_SSE4_1
794 #include <smmintrin.h>
795
796 static inline void
797 2 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
798 {
799 2 float* bPtr = bVector;
800 2 const float* aPtr = aVector;
801
802 2 unsigned int number = 0;
803 2 unsigned int quarterPoints = num_points / 4;
804 2 unsigned int i = 0;
805
806 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
807 fzeroes;
808 __m128 sine, cosine, condition1, condition3;
809 __m128i q, r, ones, twos, fours;
810
811 2 m4pi = _mm_set1_ps(1.273239545);
812 2 pio4A = _mm_set1_ps(0.78515625);
813 2 pio4B = _mm_set1_ps(0.241876e-3);
814 2 ffours = _mm_set1_ps(4.0);
815 2 ftwos = _mm_set1_ps(2.0);
816 2 fones = _mm_set1_ps(1.0);
817 2 fzeroes = _mm_setzero_ps();
818 2 ones = _mm_set1_epi32(1);
819 2 twos = _mm_set1_epi32(2);
820 2 fours = _mm_set1_epi32(4);
821
822 2 cp1 = _mm_set1_ps(1.0);
823 2 cp2 = _mm_set1_ps(0.83333333e-1);
824 2 cp3 = _mm_set1_ps(0.2777778e-2);
825 2 cp4 = _mm_set1_ps(0.49603e-4);
826 2 cp5 = _mm_set1_ps(0.551e-6);
827
828
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
829 65534 aVal = _mm_loadu_ps(aPtr);
830 262136 s = _mm_sub_ps(aVal,
831 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
832 131068 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
833 131068 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
834
835 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
836 196602 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
837
838 131068 s = _mm_div_ps(
839 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
840 65534 s = _mm_mul_ps(s, s);
841 // Evaluate Taylor series
842 524272 s = _mm_mul_ps(
843 _mm_add_ps(
844 _mm_mul_ps(
845 _mm_sub_ps(
846 _mm_mul_ps(
847 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
848 cp3),
849 s),
850 cp2),
851 s),
852 cp1),
853 s);
854
855
2/2
✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
262136 for (i = 0; i < 3; i++) {
856 393204 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
857 }
858 65534 s = _mm_div_ps(s, ftwos);
859
860 196602 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
861 65534 cosine = _mm_sub_ps(fones, s);
862
863 262136 condition1 = _mm_cmpneq_ps(
864 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
865
866 262136 condition3 = _mm_cmpneq_ps(
867 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
868
869 196602 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
870 262136 cosine = _mm_sub_ps(
871 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
872 _mm_storeu_ps(bPtr, cosine);
873 65534 aPtr += 4;
874 65534 bPtr += 4;
875 }
876
877 2 number = quarterPoints * 4;
878
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
879 6 *bPtr++ = cosf(*aPtr++);
880 }
881 2 }
882
883 #endif /* LV_HAVE_SSE4_1 for unaligned */
884
885
886 #ifdef LV_HAVE_GENERIC
887
888 /*
889 * For derivation see
890 * Shibata, Naoki, "Efficient evaluation methods of elementary functions
891 * suitable for SIMD computation," in Springer-Verlag 2010
892 */
893 2 static inline void volk_32f_cos_32f_generic_fast(float* bVector,
894 const float* aVector,
895 unsigned int num_points)
896 {
897 2 float* bPtr = bVector;
898 2 const float* aPtr = aVector;
899
900 2 float m4pi = 1.273239544735162542821171882678754627704620361328125;
901 2 float pio4A = 0.7853981554508209228515625;
902 2 float pio4B = 0.794662735614792836713604629039764404296875e-8;
903 2 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
904 2 int N = 3; // order of argument reduction
905
906 unsigned int number;
907
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
908 262142 float s = fabs(*aPtr);
909 262142 int q = (int)(s * m4pi);
910 262142 int r = q + (q & 1);
911 262142 s -= r * pio4A;
912 262142 s -= r * pio4B;
913 262142 s -= r * pio4C;
914
915 262142 s = s * 0.125; // 2^-N (<--3)
916 262142 s = s * s;
917 262142 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
918 262142 1.0) *
919 s;
920
921 int i;
922
2/2
✓ Branch 0 taken 786426 times.
✓ Branch 1 taken 262142 times.
1048568 for (i = 0; i < N; ++i) {
923 786426 s = (4.0 - s) * s;
924 }
925 262142 s = s / 2.0;
926
927 262142 float sine = sqrt((2.0 - s) * s);
928 262142 float cosine = 1 - s;
929
930
2/2
✓ Branch 0 taken 56035 times.
✓ Branch 1 taken 206107 times.
262142 if (((q + 1) & 2) != 0) {
931 56035 s = cosine;
932 56035 cosine = sine;
933 56035 sine = s;
934 }
935
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.
262142 if (((q + 2) & 4) != 0) {
936 cosine = -cosine;
937 }
938 262142 *bPtr = cosine;
939 262142 bPtr++;
940 262142 aPtr++;
941 }
942 2 }
943
944 #endif /* LV_HAVE_GENERIC */
945
946
947 #ifdef LV_HAVE_GENERIC
948
949 static inline void
950 2 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
951 {
952 2 float* bPtr = bVector;
953 2 const float* aPtr = aVector;
954 2 unsigned int number = 0;
955
956
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (; number < num_points; number++) {
957 262142 *bPtr++ = cosf(*aPtr++);
958 }
959 2 }
960
961 #endif /* LV_HAVE_GENERIC */
962
963
964 #ifdef LV_HAVE_NEON
965 #include <arm_neon.h>
966 #include <volk/volk_neon_intrinsics.h>
967
968 static inline void
969 volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
970 {
971 unsigned int number = 0;
972 unsigned int quarter_points = num_points / 4;
973 float* bVectorPtr = bVector;
974 const float* aVectorPtr = aVector;
975
976 float32x4_t b_vec;
977 float32x4_t a_vec;
978
979 for (number = 0; number < quarter_points; number++) {
980 a_vec = vld1q_f32(aVectorPtr);
981 // Prefetch next one, speeds things up
982 __VOLK_PREFETCH(aVectorPtr + 4);
983 b_vec = _vcosq_f32(a_vec);
984 vst1q_f32(bVectorPtr, b_vec);
985 // move pointers ahead
986 bVectorPtr += 4;
987 aVectorPtr += 4;
988 }
989
990 // Deal with the rest
991 for (number = quarter_points * 4; number < num_points; number++) {
992 *bVectorPtr++ = cosf(*aVectorPtr++);
993 }
994 }
995
996 #endif /* LV_HAVE_NEON */
997
998
999 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
1000