GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_acos_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 253 253 100.0%
Functions: 7 7 100.0%
Branches: 50 50 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_acos_32f
12 *
13 * \b Overview
14 *
15 * Computes arccosine of the input vector and stores results in the output vector.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li aVector: The input vector of floats.
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li bVector: The vector where results will be stored.
28 *
29 * \b Example
30 * Calculate common angles around the top half of the unit circle.
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * in[0] = 1;
38 * in[1] = std::sqrt(3.f)/2.f;
39 * in[2] = std::sqrt(2.f)/2.f;
40 * in[3] = 0.5;
41 * in[4] = in[5] = 0;
42 * for(unsigned int ii = 6; ii < N; ++ii){
43 * in[ii] = - in[N-ii-1];
44 * }
45 *
46 * volk_32f_acos_32f(out, in, N);
47 *
48 * for(unsigned int ii = 0; ii < N; ++ii){
49 * printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
50 * }
51 *
52 * volk_free(in);
53 * volk_free(out);
54 * \endcode
55 */
56
57 #include <inttypes.h>
58 #include <math.h>
59 #include <stdio.h>
60
61 /* This is the number of terms of Taylor series to evaluate, increase this for more
62 * accuracy*/
63 #define ACOS_TERMS 2
64
65 #ifndef INCLUDED_volk_32f_acos_32f_a_H
66 #define INCLUDED_volk_32f_acos_32f_a_H
67
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70
71 2 static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72 const float* aVector,
73 unsigned int num_points)
74 {
75 2 float* bPtr = bVector;
76 2 const float* aPtr = aVector;
77
78 2 unsigned int number = 0;
79 2 unsigned int eighthPoints = num_points / 8;
80 int i, j;
81
82 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
84
85 2 pi = _mm256_set1_ps(3.14159265358979323846);
86 2 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87 2 fzeroes = _mm256_setzero_ps();
88 2 fones = _mm256_set1_ps(1.0);
89 2 ftwos = _mm256_set1_ps(2.0);
90 2 ffours = _mm256_set1_ps(4.0);
91
92
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
93 32766 aVal = _mm256_load_ps(aPtr);
94 32766 d = aVal;
95 131064 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96 _mm256_sub_ps(fones, aVal))),
97 aVal);
98 32766 z = aVal;
99 32766 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100 65532 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101 32766 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102 98298 x = _mm256_add_ps(
103 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104
105
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (i = 0; i < 2; i++)
106 196596 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107 32766 x = _mm256_div_ps(fones, x);
108 32766 y = fzeroes;
109
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (j = ACOS_TERMS - 1; j >= 0; j--)
110 196596 y = _mm256_fmadd_ps(
111 65532 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
112
113 32766 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114 32766 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
115
116 65532 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
117 32766 arccosine = y;
118 32766 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119 65532 arccosine = _mm256_sub_ps(
120 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
121 32766 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122 65532 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
123
124 _mm256_store_ps(bPtr, arccosine);
125 32766 aPtr += 8;
126 32766 bPtr += 8;
127 }
128
129 2 number = eighthPoints * 8;
130
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
131 14 *bPtr++ = acos(*aPtr++);
132 }
133 2 }
134
135 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
136
137
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
140
141 static inline void
142 2 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
143 {
144 2 float* bPtr = bVector;
145 2 const float* aPtr = aVector;
146
147 2 unsigned int number = 0;
148 2 unsigned int eighthPoints = num_points / 8;
149 int i, j;
150
151 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
152 __m256 fzeroes, fones, ftwos, ffours, condition;
153
154 2 pi = _mm256_set1_ps(3.14159265358979323846);
155 2 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156 2 fzeroes = _mm256_setzero_ps();
157 2 fones = _mm256_set1_ps(1.0);
158 2 ftwos = _mm256_set1_ps(2.0);
159 2 ffours = _mm256_set1_ps(4.0);
160
161
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
162 32766 aVal = _mm256_load_ps(aPtr);
163 32766 d = aVal;
164 131064 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165 _mm256_sub_ps(fones, aVal))),
166 aVal);
167 32766 z = aVal;
168 32766 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169 65532 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
170 32766 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
171 98298 x = _mm256_add_ps(
172 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
173
174
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (i = 0; i < 2; i++)
175 262128 x = _mm256_add_ps(x,
176 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
177 32766 x = _mm256_div_ps(fones, x);
178 32766 y = fzeroes;
179
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (j = ACOS_TERMS - 1; j >= 0; j--)
180 262128 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181 65532 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
182
183 32766 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
184 32766 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
185
186 98298 y = _mm256_add_ps(
187 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
188 32766 arccosine = y;
189 32766 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190 65532 arccosine = _mm256_sub_ps(
191 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
192 32766 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193 65532 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
194
195 _mm256_store_ps(bPtr, arccosine);
196 32766 aPtr += 8;
197 32766 bPtr += 8;
198 }
199
200 2 number = eighthPoints * 8;
201
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
202 14 *bPtr++ = acos(*aPtr++);
203 }
204 2 }
205
206 #endif /* LV_HAVE_AVX for aligned */
207
208 #ifdef LV_HAVE_SSE4_1
209 #include <smmintrin.h>
210
211 static inline void
212 2 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
213 {
214 2 float* bPtr = bVector;
215 2 const float* aPtr = aVector;
216
217 2 unsigned int number = 0;
218 2 unsigned int quarterPoints = num_points / 4;
219 int i, j;
220
221 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
222 __m128 fzeroes, fones, ftwos, ffours, condition;
223
224 2 pi = _mm_set1_ps(3.14159265358979323846);
225 2 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
226 2 fzeroes = _mm_setzero_ps();
227 2 fones = _mm_set1_ps(1.0);
228 2 ftwos = _mm_set1_ps(2.0);
229 2 ffours = _mm_set1_ps(4.0);
230
231
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
232 65534 aVal = _mm_load_ps(aPtr);
233 65534 d = aVal;
234 262136 aVal = _mm_div_ps(
235 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
236 aVal);
237 65534 z = aVal;
238 65534 condition = _mm_cmplt_ps(z, fzeroes);
239 196602 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
240 65534 condition = _mm_cmplt_ps(z, fones);
241 196602 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
242
243
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.
196602 for (i = 0; i < 2; i++)
244 524272 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245 65534 x = _mm_div_ps(fones, x);
246 65534 y = fzeroes;
247
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.
196602 for (j = ACOS_TERMS - 1; j >= 0; j--)
248 524272 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
249 131068 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
250
251 131068 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
252 65534 condition = _mm_cmpgt_ps(z, fones);
253
254 196602 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255 65534 arccosine = y;
256 65534 condition = _mm_cmplt_ps(aVal, fzeroes);
257 arccosine =
258 196602 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
259 65534 condition = _mm_cmplt_ps(d, fzeroes);
260 131068 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
261
262 _mm_store_ps(bPtr, arccosine);
263 65534 aPtr += 4;
264 65534 bPtr += 4;
265 }
266
267 2 number = quarterPoints * 4;
268
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
269 6 *bPtr++ = acosf(*aPtr++);
270 }
271 2 }
272
273 #endif /* LV_HAVE_SSE4_1 for aligned */
274
275 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
276
277
278 #ifndef INCLUDED_volk_32f_acos_32f_u_H
279 #define INCLUDED_volk_32f_acos_32f_u_H
280
281 #if LV_HAVE_AVX2 && LV_HAVE_FMA
282 #include <immintrin.h>
283
284 2 static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
285 const float* aVector,
286 unsigned int num_points)
287 {
288 2 float* bPtr = bVector;
289 2 const float* aPtr = aVector;
290
291 2 unsigned int number = 0;
292 2 unsigned int eighthPoints = num_points / 8;
293 int i, j;
294
295 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
296 __m256 fzeroes, fones, ftwos, ffours, condition;
297
298 2 pi = _mm256_set1_ps(3.14159265358979323846);
299 2 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300 2 fzeroes = _mm256_setzero_ps();
301 2 fones = _mm256_set1_ps(1.0);
302 2 ftwos = _mm256_set1_ps(2.0);
303 2 ffours = _mm256_set1_ps(4.0);
304
305
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
306 32766 aVal = _mm256_loadu_ps(aPtr);
307 32766 d = aVal;
308 131064 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309 _mm256_sub_ps(fones, aVal))),
310 aVal);
311 32766 z = aVal;
312 32766 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313 65532 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
314 32766 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
315 98298 x = _mm256_add_ps(
316 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
317
318
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (i = 0; i < 2; i++)
319 196596 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
320 32766 x = _mm256_div_ps(fones, x);
321 32766 y = fzeroes;
322
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (j = ACOS_TERMS - 1; j >= 0; j--)
323 196596 y = _mm256_fmadd_ps(
324 65532 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
325
326 32766 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
327 32766 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
328
329 65532 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
330 32766 arccosine = y;
331 32766 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332 65532 arccosine = _mm256_sub_ps(
333 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
334 32766 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335 65532 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
336
337 _mm256_storeu_ps(bPtr, arccosine);
338 32766 aPtr += 8;
339 32766 bPtr += 8;
340 }
341
342 2 number = eighthPoints * 8;
343
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
344 14 *bPtr++ = acos(*aPtr++);
345 }
346 2 }
347
348 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
349
350
351 #ifdef LV_HAVE_AVX
352 #include <immintrin.h>
353
354 static inline void
355 2 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
356 {
357 2 float* bPtr = bVector;
358 2 const float* aPtr = aVector;
359
360 2 unsigned int number = 0;
361 2 unsigned int eighthPoints = num_points / 8;
362 int i, j;
363
364 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
365 __m256 fzeroes, fones, ftwos, ffours, condition;
366
367 2 pi = _mm256_set1_ps(3.14159265358979323846);
368 2 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369 2 fzeroes = _mm256_setzero_ps();
370 2 fones = _mm256_set1_ps(1.0);
371 2 ftwos = _mm256_set1_ps(2.0);
372 2 ffours = _mm256_set1_ps(4.0);
373
374
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
375 32766 aVal = _mm256_loadu_ps(aPtr);
376 32766 d = aVal;
377 131064 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378 _mm256_sub_ps(fones, aVal))),
379 aVal);
380 32766 z = aVal;
381 32766 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382 65532 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
383 32766 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
384 98298 x = _mm256_add_ps(
385 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
386
387
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (i = 0; i < 2; i++)
388 262128 x = _mm256_add_ps(x,
389 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
390 32766 x = _mm256_div_ps(fones, x);
391 32766 y = fzeroes;
392
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 32766 times.
98298 for (j = ACOS_TERMS - 1; j >= 0; j--)
393 262128 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394 65532 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
395
396 32766 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
397 32766 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
398
399 98298 y = _mm256_add_ps(
400 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
401 32766 arccosine = y;
402 32766 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403 65532 arccosine = _mm256_sub_ps(
404 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
405 32766 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406 65532 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
407
408 _mm256_storeu_ps(bPtr, arccosine);
409 32766 aPtr += 8;
410 32766 bPtr += 8;
411 }
412
413 2 number = eighthPoints * 8;
414
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
415 14 *bPtr++ = acos(*aPtr++);
416 }
417 2 }
418
419 #endif /* LV_HAVE_AVX for unaligned */
420
421 #ifdef LV_HAVE_SSE4_1
422 #include <smmintrin.h>
423
424 static inline void
425 2 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
426 {
427 2 float* bPtr = bVector;
428 2 const float* aPtr = aVector;
429
430 2 unsigned int number = 0;
431 2 unsigned int quarterPoints = num_points / 4;
432 int i, j;
433
434 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
435 __m128 fzeroes, fones, ftwos, ffours, condition;
436
437 2 pi = _mm_set1_ps(3.14159265358979323846);
438 2 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
439 2 fzeroes = _mm_setzero_ps();
440 2 fones = _mm_set1_ps(1.0);
441 2 ftwos = _mm_set1_ps(2.0);
442 2 ffours = _mm_set1_ps(4.0);
443
444
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
445 65534 aVal = _mm_loadu_ps(aPtr);
446 65534 d = aVal;
447 262136 aVal = _mm_div_ps(
448 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
449 aVal);
450 65534 z = aVal;
451 65534 condition = _mm_cmplt_ps(z, fzeroes);
452 196602 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453 65534 condition = _mm_cmplt_ps(z, fones);
454 196602 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455
456
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.
196602 for (i = 0; i < 2; i++)
457 524272 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458 65534 x = _mm_div_ps(fones, x);
459 65534 y = fzeroes;
460
461
2/2
✓ Branch 0 taken 131068 times.
✓ Branch 1 taken 65534 times.
196602 for (j = ACOS_TERMS - 1; j >= 0; j--)
462 524272 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463 131068 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464
465 131068 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
466 65534 condition = _mm_cmpgt_ps(z, fones);
467
468 196602 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
469 65534 arccosine = y;
470 65534 condition = _mm_cmplt_ps(aVal, fzeroes);
471 arccosine =
472 196602 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
473 65534 condition = _mm_cmplt_ps(d, fzeroes);
474 131068 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
475
476 _mm_storeu_ps(bPtr, arccosine);
477 65534 aPtr += 4;
478 65534 bPtr += 4;
479 }
480
481 2 number = quarterPoints * 4;
482
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
483 6 *bPtr++ = acosf(*aPtr++);
484 }
485 2 }
486
487 #endif /* LV_HAVE_SSE4_1 for unaligned */
488
489 #ifdef LV_HAVE_GENERIC
490
491 static inline void
492 2 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
493 {
494 2 float* bPtr = bVector;
495 2 const float* aPtr = aVector;
496 2 unsigned int number = 0;
497
498
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
499 262142 *bPtr++ = acosf(*aPtr++);
500 }
501 2 }
502 #endif /* LV_HAVE_GENERIC */
503
504 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
505