Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_sin_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the sine of the input vector and stores the results in the output vector. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32f_sin_32f(float* bVector, const float* aVector, unsigned int num_points) | ||
20 | * \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li aVector: The input vector of floats. | ||
24 | * \li num_points: The number of data points. | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li bVector: The output vector. | ||
28 | * | ||
29 | * \b Example | ||
30 | * Calculate sin(theta) for several common angles. | ||
31 | * \code | ||
32 | * int N = 10; | ||
33 | * unsigned int alignment = volk_get_alignment(); | ||
34 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * | ||
37 | * in[0] = 0.000; | ||
38 | * in[1] = 0.524; | ||
39 | * in[2] = 0.786; | ||
40 | * in[3] = 1.047; | ||
41 | * in[4] = 1.571; | ||
42 | * in[5] = 1.571; | ||
43 | * in[6] = 2.094; | ||
44 | * in[7] = 2.356; | ||
45 | * in[8] = 2.618; | ||
46 | * in[9] = 3.142; | ||
47 | * | ||
48 | * volk_32f_sin_32f(out, in, N); | ||
49 | * | ||
50 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
51 | * printf("sin(%1.3f) = %1.3f\n", in[ii], out[ii]); | ||
52 | * } | ||
53 | * | ||
54 | * volk_free(in); | ||
55 | * volk_free(out); | ||
56 | * \endcode | ||
57 | */ | ||
58 | |||
59 | #include <inttypes.h> | ||
60 | #include <math.h> | ||
61 | #include <stdio.h> | ||
62 | |||
63 | #ifndef INCLUDED_volk_32f_sin_32f_a_H | ||
64 | #define INCLUDED_volk_32f_sin_32f_a_H | ||
65 | #ifdef LV_HAVE_AVX512F | ||
66 | |||
67 | #include <immintrin.h> | ||
68 | ✗ | static inline void volk_32f_sin_32f_a_avx512f(float* sinVector, | |
69 | const float* inVector, | ||
70 | unsigned int num_points) | ||
71 | { | ||
72 | ✗ | float* sinPtr = sinVector; | |
73 | ✗ | const float* inPtr = inVector; | |
74 | |||
75 | ✗ | unsigned int number = 0; | |
76 | ✗ | unsigned int sixteenPoints = num_points / 16; | |
77 | ✗ | unsigned int i = 0; | |
78 | |||
79 | __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, | ||
80 | fones; | ||
81 | __m512 sine, cosine; | ||
82 | __m512i q, zeros, ones, twos, fours; | ||
83 | |||
84 | ✗ | m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125); | |
85 | ✗ | pio4A = _mm512_set1_ps(0.7853981554508209228515625); | |
86 | ✗ | pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8); | |
87 | ✗ | pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); | |
88 | ✗ | ffours = _mm512_set1_ps(4.0); | |
89 | ✗ | ftwos = _mm512_set1_ps(2.0); | |
90 | ✗ | fones = _mm512_set1_ps(1.0); | |
91 | ✗ | zeros = _mm512_setzero_epi32(); | |
92 | ✗ | ones = _mm512_set1_epi32(1); | |
93 | ✗ | twos = _mm512_set1_epi32(2); | |
94 | ✗ | fours = _mm512_set1_epi32(4); | |
95 | |||
96 | ✗ | cp1 = _mm512_set1_ps(1.0); | |
97 | ✗ | cp2 = _mm512_set1_ps(0.08333333333333333); | |
98 | ✗ | cp3 = _mm512_set1_ps(0.002777777777777778); | |
99 | ✗ | cp4 = _mm512_set1_ps(4.96031746031746e-05); | |
100 | ✗ | cp5 = _mm512_set1_ps(5.511463844797178e-07); | |
101 | __mmask16 condition1, condition2, ltZero; | ||
102 | |||
103 | ✗ | for (; number < sixteenPoints; number++) { | |
104 | ✗ | aVal = _mm512_load_ps(inPtr); | |
105 | // s = fabs(aVal) | ||
106 | ✗ | s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff))); | |
107 | |||
108 | // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) | ||
109 | ✗ | q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi))); | |
110 | // r = q + q&1, q indicates quadrant, r gives | ||
111 | ✗ | r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones))); | |
112 | |||
113 | ✗ | s = _mm512_fnmadd_ps(r, pio4A, s); | |
114 | ✗ | s = _mm512_fnmadd_ps(r, pio4B, s); | |
115 | ✗ | s = _mm512_fnmadd_ps(r, pio4C, s); | |
116 | |||
117 | ✗ | s = _mm512_div_ps( | |
118 | s, | ||
119 | _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction | ||
120 | ✗ | s = _mm512_mul_ps(s, s); | |
121 | // Evaluate Taylor series | ||
122 | ✗ | s = _mm512_mul_ps( | |
123 | _mm512_fmadd_ps( | ||
124 | _mm512_fmsub_ps( | ||
125 | _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
126 | s, | ||
127 | cp1), | ||
128 | s); | ||
129 | |||
130 | ✗ | for (i = 0; i < 3; i++) | |
131 | ✗ | s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); | |
132 | ✗ | s = _mm512_div_ps(s, ftwos); | |
133 | |||
134 | ✗ | sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); | |
135 | ✗ | cosine = _mm512_sub_ps(fones, s); | |
136 | |||
137 | ✗ | condition1 = _mm512_cmpneq_epi32_mask( | |
138 | _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros); | ||
139 | ✗ | ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS); | |
140 | ✗ | condition2 = _mm512_kxor( | |
141 | ✗ | _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero); | |
142 | |||
143 | ✗ | sine = _mm512_mask_blend_ps(condition1, sine, cosine); | |
144 | ✗ | sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f)); | |
145 | _mm512_store_ps(sinPtr, sine); | ||
146 | ✗ | inPtr += 16; | |
147 | ✗ | sinPtr += 16; | |
148 | } | ||
149 | |||
150 | ✗ | number = sixteenPoints * 16; | |
151 | ✗ | for (; number < num_points; number++) { | |
152 | ✗ | *sinPtr++ = sinf(*inPtr++); | |
153 | } | ||
154 | ✗ | } | |
155 | #endif | ||
156 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
157 | #include <immintrin.h> | ||
158 | |||
159 | static inline void | ||
160 | 2 | volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) | |
161 | { | ||
162 | 2 | float* bPtr = bVector; | |
163 | 2 | const float* aPtr = aVector; | |
164 | |||
165 | 2 | unsigned int number = 0; | |
166 | 2 | unsigned int eighthPoints = num_points / 8; | |
167 | 2 | unsigned int i = 0; | |
168 | |||
169 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
170 | fzeroes; | ||
171 | __m256 sine, cosine, condition1, condition2; | ||
172 | __m256i q, r, ones, twos, fours; | ||
173 | |||
174 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
175 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
176 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
177 | 2 | ffours = _mm256_set1_ps(4.0); | |
178 | 2 | ftwos = _mm256_set1_ps(2.0); | |
179 | 2 | fones = _mm256_set1_ps(1.0); | |
180 | 2 | fzeroes = _mm256_setzero_ps(); | |
181 | 2 | ones = _mm256_set1_epi32(1); | |
182 | 2 | twos = _mm256_set1_epi32(2); | |
183 | 2 | fours = _mm256_set1_epi32(4); | |
184 | |||
185 | 2 | cp1 = _mm256_set1_ps(1.0); | |
186 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
187 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
188 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
189 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
190 | |||
191 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
192 | 32766 | aVal = _mm256_load_ps(aPtr); | |
193 | 98298 | s = _mm256_sub_ps(aVal, | |
194 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
195 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
196 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
197 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
198 | |||
199 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); | |
200 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); | |
201 | |||
202 | 65532 | s = _mm256_div_ps( | |
203 | s, | ||
204 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
205 | 32766 | s = _mm256_mul_ps(s, s); | |
206 | // Evaluate Taylor series | ||
207 | 131064 | s = _mm256_mul_ps( | |
208 | _mm256_fmadd_ps( | ||
209 | _mm256_fmsub_ps( | ||
210 | _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
211 | s, | ||
212 | cp1), | ||
213 | s); | ||
214 | |||
215 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
216 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
217 | } | ||
218 | 32766 | s = _mm256_div_ps(s, ftwos); | |
219 | |||
220 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
221 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
222 | |||
223 | 65532 | condition1 = _mm256_cmp_ps( | |
224 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
225 | fzeroes, | ||
226 | _CMP_NEQ_UQ); | ||
227 | 98298 | condition2 = _mm256_cmp_ps( | |
228 | _mm256_cmp_ps( | ||
229 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
230 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
231 | _CMP_NEQ_UQ); | ||
232 | // Need this condition only for cos | ||
233 | // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, | ||
234 | // twos), fours)), fzeroes); | ||
235 | |||
236 | sine = | ||
237 | 98298 | _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); | |
238 | 131064 | sine = _mm256_sub_ps( | |
239 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
240 | _mm256_store_ps(bPtr, sine); | ||
241 | 32766 | aPtr += 8; | |
242 | 32766 | bPtr += 8; | |
243 | } | ||
244 | |||
245 | 2 | number = eighthPoints * 8; | |
246 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
247 | 14 | *bPtr++ = sin(*aPtr++); | |
248 | } | ||
249 | 2 | } | |
250 | |||
251 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
252 | |||
253 | #ifdef LV_HAVE_AVX2 | ||
254 | #include <immintrin.h> | ||
255 | |||
256 | static inline void | ||
257 | 2 | volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
258 | { | ||
259 | 2 | float* bPtr = bVector; | |
260 | 2 | const float* aPtr = aVector; | |
261 | |||
262 | 2 | unsigned int number = 0; | |
263 | 2 | unsigned int eighthPoints = num_points / 8; | |
264 | 2 | unsigned int i = 0; | |
265 | |||
266 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
267 | fzeroes; | ||
268 | __m256 sine, cosine, condition1, condition2; | ||
269 | __m256i q, r, ones, twos, fours; | ||
270 | |||
271 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
272 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
273 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
274 | 2 | ffours = _mm256_set1_ps(4.0); | |
275 | 2 | ftwos = _mm256_set1_ps(2.0); | |
276 | 2 | fones = _mm256_set1_ps(1.0); | |
277 | 2 | fzeroes = _mm256_setzero_ps(); | |
278 | 2 | ones = _mm256_set1_epi32(1); | |
279 | 2 | twos = _mm256_set1_epi32(2); | |
280 | 2 | fours = _mm256_set1_epi32(4); | |
281 | |||
282 | 2 | cp1 = _mm256_set1_ps(1.0); | |
283 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
284 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
285 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
286 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
287 | |||
288 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
289 | 32766 | aVal = _mm256_load_ps(aPtr); | |
290 | 98298 | s = _mm256_sub_ps(aVal, | |
291 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
292 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
293 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
294 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
295 | |||
296 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); | |
297 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); | |
298 | |||
299 | 65532 | s = _mm256_div_ps( | |
300 | s, | ||
301 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
302 | 32766 | s = _mm256_mul_ps(s, s); | |
303 | // Evaluate Taylor series | ||
304 | 262128 | s = _mm256_mul_ps( | |
305 | _mm256_add_ps( | ||
306 | _mm256_mul_ps( | ||
307 | _mm256_sub_ps( | ||
308 | _mm256_mul_ps( | ||
309 | _mm256_add_ps( | ||
310 | _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), | ||
311 | s), | ||
312 | cp3), | ||
313 | s), | ||
314 | cp2), | ||
315 | s), | ||
316 | cp1), | ||
317 | s); | ||
318 | |||
319 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
320 | 294894 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
321 | } | ||
322 | 32766 | s = _mm256_div_ps(s, ftwos); | |
323 | |||
324 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
325 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
326 | |||
327 | 65532 | condition1 = _mm256_cmp_ps( | |
328 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
329 | fzeroes, | ||
330 | _CMP_NEQ_UQ); | ||
331 | 98298 | condition2 = _mm256_cmp_ps( | |
332 | _mm256_cmp_ps( | ||
333 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
334 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
335 | _CMP_NEQ_UQ); | ||
336 | // Need this condition only for cos | ||
337 | // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, | ||
338 | // twos), fours)), fzeroes); | ||
339 | |||
340 | sine = | ||
341 | 98298 | _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); | |
342 | 131064 | sine = _mm256_sub_ps( | |
343 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
344 | _mm256_store_ps(bPtr, sine); | ||
345 | 32766 | aPtr += 8; | |
346 | 32766 | bPtr += 8; | |
347 | } | ||
348 | |||
349 | 2 | number = eighthPoints * 8; | |
350 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
351 | 14 | *bPtr++ = sin(*aPtr++); | |
352 | } | ||
353 | 2 | } | |
354 | |||
355 | #endif /* LV_HAVE_AVX2 for aligned */ | ||
356 | |||
357 | #ifdef LV_HAVE_SSE4_1 | ||
358 | #include <smmintrin.h> | ||
359 | |||
360 | static inline void | ||
361 | 2 | volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
362 | { | ||
363 | 2 | float* bPtr = bVector; | |
364 | 2 | const float* aPtr = aVector; | |
365 | |||
366 | 2 | unsigned int number = 0; | |
367 | 2 | unsigned int quarterPoints = num_points / 4; | |
368 | 2 | unsigned int i = 0; | |
369 | |||
370 | __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
371 | fzeroes; | ||
372 | __m128 sine, cosine, condition1, condition2; | ||
373 | __m128i q, r, ones, twos, fours; | ||
374 | |||
375 | 2 | m4pi = _mm_set1_ps(1.273239545); | |
376 | 2 | pio4A = _mm_set1_ps(0.78515625); | |
377 | 2 | pio4B = _mm_set1_ps(0.241876e-3); | |
378 | 2 | ffours = _mm_set1_ps(4.0); | |
379 | 2 | ftwos = _mm_set1_ps(2.0); | |
380 | 2 | fones = _mm_set1_ps(1.0); | |
381 | 2 | fzeroes = _mm_setzero_ps(); | |
382 | 2 | ones = _mm_set1_epi32(1); | |
383 | 2 | twos = _mm_set1_epi32(2); | |
384 | 2 | fours = _mm_set1_epi32(4); | |
385 | |||
386 | 2 | cp1 = _mm_set1_ps(1.0); | |
387 | 2 | cp2 = _mm_set1_ps(0.83333333e-1); | |
388 | 2 | cp3 = _mm_set1_ps(0.2777778e-2); | |
389 | 2 | cp4 = _mm_set1_ps(0.49603e-4); | |
390 | 2 | cp5 = _mm_set1_ps(0.551e-6); | |
391 | |||
392 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
393 | 65534 | aVal = _mm_load_ps(aPtr); | |
394 | 262136 | s = _mm_sub_ps(aVal, | |
395 | _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); | ||
396 | 131068 | q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); | |
397 | 131068 | r = _mm_add_epi32(q, _mm_and_si128(q, ones)); | |
398 | |||
399 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); | |
400 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); | |
401 | |||
402 | 131068 | s = _mm_div_ps( | |
403 | s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
404 | 65534 | s = _mm_mul_ps(s, s); | |
405 | // Evaluate Taylor series | ||
406 | 524272 | s = _mm_mul_ps( | |
407 | _mm_add_ps( | ||
408 | _mm_mul_ps( | ||
409 | _mm_sub_ps( | ||
410 | _mm_mul_ps( | ||
411 | _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), | ||
412 | cp3), | ||
413 | s), | ||
414 | cp2), | ||
415 | s), | ||
416 | cp1), | ||
417 | s); | ||
418 | |||
419 |
2/2✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
|
262136 | for (i = 0; i < 3; i++) { |
420 | 393204 | s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); | |
421 | } | ||
422 | 65534 | s = _mm_div_ps(s, ftwos); | |
423 | |||
424 | 196602 | sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); | |
425 | 65534 | cosine = _mm_sub_ps(fones, s); | |
426 | |||
427 | 262136 | condition1 = _mm_cmpneq_ps( | |
428 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); | ||
429 | 327670 | condition2 = _mm_cmpneq_ps( | |
430 | _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), | ||
431 | _mm_cmplt_ps(aVal, fzeroes)); | ||
432 | // Need this condition only for cos | ||
433 | // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, | ||
434 | // twos), fours)), fzeroes); | ||
435 | |||
436 | 196602 | sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); | |
437 | sine = | ||
438 | 262136 | _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); | |
439 | _mm_store_ps(bPtr, sine); | ||
440 | 65534 | aPtr += 4; | |
441 | 65534 | bPtr += 4; | |
442 | } | ||
443 | |||
444 | 2 | number = quarterPoints * 4; | |
445 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
446 | 6 | *bPtr++ = sinf(*aPtr++); | |
447 | } | ||
448 | 2 | } | |
449 | |||
450 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
451 | |||
452 | |||
453 | #endif /* INCLUDED_volk_32f_sin_32f_a_H */ | ||
454 | |||
455 | #ifndef INCLUDED_volk_32f_sin_32f_u_H | ||
456 | #define INCLUDED_volk_32f_sin_32f_u_H | ||
457 | |||
458 | #ifdef LV_HAVE_AVX512F | ||
459 | |||
460 | #include <immintrin.h> | ||
461 | ✗ | static inline void volk_32f_sin_32f_u_avx512f(float* sinVector, | |
462 | const float* inVector, | ||
463 | unsigned int num_points) | ||
464 | { | ||
465 | ✗ | float* sinPtr = sinVector; | |
466 | ✗ | const float* inPtr = inVector; | |
467 | |||
468 | ✗ | unsigned int number = 0; | |
469 | ✗ | unsigned int sixteenPoints = num_points / 16; | |
470 | ✗ | unsigned int i = 0; | |
471 | |||
472 | __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, | ||
473 | fones; | ||
474 | __m512 sine, cosine; | ||
475 | __m512i q, zeros, ones, twos, fours; | ||
476 | |||
477 | ✗ | m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125); | |
478 | ✗ | pio4A = _mm512_set1_ps(0.7853981554508209228515625); | |
479 | ✗ | pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8); | |
480 | ✗ | pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); | |
481 | ✗ | ffours = _mm512_set1_ps(4.0); | |
482 | ✗ | ftwos = _mm512_set1_ps(2.0); | |
483 | ✗ | fones = _mm512_set1_ps(1.0); | |
484 | ✗ | zeros = _mm512_setzero_epi32(); | |
485 | ✗ | ones = _mm512_set1_epi32(1); | |
486 | ✗ | twos = _mm512_set1_epi32(2); | |
487 | ✗ | fours = _mm512_set1_epi32(4); | |
488 | |||
489 | ✗ | cp1 = _mm512_set1_ps(1.0); | |
490 | ✗ | cp2 = _mm512_set1_ps(0.08333333333333333); | |
491 | ✗ | cp3 = _mm512_set1_ps(0.002777777777777778); | |
492 | ✗ | cp4 = _mm512_set1_ps(4.96031746031746e-05); | |
493 | ✗ | cp5 = _mm512_set1_ps(5.511463844797178e-07); | |
494 | __mmask16 condition1, condition2, ltZero; | ||
495 | |||
496 | ✗ | for (; number < sixteenPoints; number++) { | |
497 | ✗ | aVal = _mm512_loadu_ps(inPtr); | |
498 | // s = fabs(aVal) | ||
499 | ✗ | s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff))); | |
500 | |||
501 | // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) | ||
502 | ✗ | q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi))); | |
503 | // r = q + q&1, q indicates quadrant, r gives | ||
504 | ✗ | r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones))); | |
505 | |||
506 | ✗ | s = _mm512_fnmadd_ps(r, pio4A, s); | |
507 | ✗ | s = _mm512_fnmadd_ps(r, pio4B, s); | |
508 | ✗ | s = _mm512_fnmadd_ps(r, pio4C, s); | |
509 | |||
510 | ✗ | s = _mm512_div_ps( | |
511 | s, | ||
512 | _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction | ||
513 | ✗ | s = _mm512_mul_ps(s, s); | |
514 | // Evaluate Taylor series | ||
515 | ✗ | s = _mm512_mul_ps( | |
516 | _mm512_fmadd_ps( | ||
517 | _mm512_fmsub_ps( | ||
518 | _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
519 | s, | ||
520 | cp1), | ||
521 | s); | ||
522 | |||
523 | ✗ | for (i = 0; i < 3; i++) | |
524 | ✗ | s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); | |
525 | ✗ | s = _mm512_div_ps(s, ftwos); | |
526 | |||
527 | ✗ | sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); | |
528 | ✗ | cosine = _mm512_sub_ps(fones, s); | |
529 | |||
530 | ✗ | condition1 = _mm512_cmpneq_epi32_mask( | |
531 | _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros); | ||
532 | ✗ | ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS); | |
533 | ✗ | condition2 = _mm512_kxor( | |
534 | ✗ | _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero); | |
535 | |||
536 | ✗ | sine = _mm512_mask_blend_ps(condition1, sine, cosine); | |
537 | ✗ | sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f)); | |
538 | _mm512_storeu_ps(sinPtr, sine); | ||
539 | ✗ | inPtr += 16; | |
540 | ✗ | sinPtr += 16; | |
541 | } | ||
542 | |||
543 | ✗ | number = sixteenPoints * 16; | |
544 | ✗ | for (; number < num_points; number++) { | |
545 | ✗ | *sinPtr++ = sinf(*inPtr++); | |
546 | } | ||
547 | ✗ | } | |
548 | #endif | ||
549 | |||
550 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
551 | #include <immintrin.h> | ||
552 | |||
553 | static inline void | ||
554 | 2 | volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) | |
555 | { | ||
556 | 2 | float* bPtr = bVector; | |
557 | 2 | const float* aPtr = aVector; | |
558 | |||
559 | 2 | unsigned int number = 0; | |
560 | 2 | unsigned int eighthPoints = num_points / 8; | |
561 | 2 | unsigned int i = 0; | |
562 | |||
563 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
564 | fzeroes; | ||
565 | __m256 sine, cosine, condition1, condition2; | ||
566 | __m256i q, r, ones, twos, fours; | ||
567 | |||
568 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
569 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
570 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
571 | 2 | ffours = _mm256_set1_ps(4.0); | |
572 | 2 | ftwos = _mm256_set1_ps(2.0); | |
573 | 2 | fones = _mm256_set1_ps(1.0); | |
574 | 2 | fzeroes = _mm256_setzero_ps(); | |
575 | 2 | ones = _mm256_set1_epi32(1); | |
576 | 2 | twos = _mm256_set1_epi32(2); | |
577 | 2 | fours = _mm256_set1_epi32(4); | |
578 | |||
579 | 2 | cp1 = _mm256_set1_ps(1.0); | |
580 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
581 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
582 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
583 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
584 | |||
585 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
586 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
587 | 98298 | s = _mm256_sub_ps(aVal, | |
588 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
589 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
590 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
591 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
592 | |||
593 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); | |
594 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); | |
595 | |||
596 | 65532 | s = _mm256_div_ps( | |
597 | s, | ||
598 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
599 | 32766 | s = _mm256_mul_ps(s, s); | |
600 | // Evaluate Taylor series | ||
601 | 131064 | s = _mm256_mul_ps( | |
602 | _mm256_fmadd_ps( | ||
603 | _mm256_fmsub_ps( | ||
604 | _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
605 | s, | ||
606 | cp1), | ||
607 | s); | ||
608 | |||
609 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
610 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
611 | } | ||
612 | 32766 | s = _mm256_div_ps(s, ftwos); | |
613 | |||
614 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
615 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
616 | |||
617 | 65532 | condition1 = _mm256_cmp_ps( | |
618 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
619 | fzeroes, | ||
620 | _CMP_NEQ_UQ); | ||
621 | 98298 | condition2 = _mm256_cmp_ps( | |
622 | _mm256_cmp_ps( | ||
623 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
624 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
625 | _CMP_NEQ_UQ); | ||
626 | // Need this condition only for cos | ||
627 | // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, | ||
628 | // twos), fours)), fzeroes); | ||
629 | |||
630 | sine = | ||
631 | 98298 | _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); | |
632 | 131064 | sine = _mm256_sub_ps( | |
633 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
634 | _mm256_storeu_ps(bPtr, sine); | ||
635 | 32766 | aPtr += 8; | |
636 | 32766 | bPtr += 8; | |
637 | } | ||
638 | |||
639 | 2 | number = eighthPoints * 8; | |
640 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
641 | 14 | *bPtr++ = sin(*aPtr++); | |
642 | } | ||
643 | 2 | } | |
644 | |||
645 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
646 | |||
647 | #ifdef LV_HAVE_AVX2 | ||
648 | #include <immintrin.h> | ||
649 | |||
650 | static inline void | ||
651 | 2 | volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
652 | { | ||
653 | 2 | float* bPtr = bVector; | |
654 | 2 | const float* aPtr = aVector; | |
655 | |||
656 | 2 | unsigned int number = 0; | |
657 | 2 | unsigned int eighthPoints = num_points / 8; | |
658 | 2 | unsigned int i = 0; | |
659 | |||
660 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
661 | fzeroes; | ||
662 | __m256 sine, cosine, condition1, condition2; | ||
663 | __m256i q, r, ones, twos, fours; | ||
664 | |||
665 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
666 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
667 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
668 | 2 | ffours = _mm256_set1_ps(4.0); | |
669 | 2 | ftwos = _mm256_set1_ps(2.0); | |
670 | 2 | fones = _mm256_set1_ps(1.0); | |
671 | 2 | fzeroes = _mm256_setzero_ps(); | |
672 | 2 | ones = _mm256_set1_epi32(1); | |
673 | 2 | twos = _mm256_set1_epi32(2); | |
674 | 2 | fours = _mm256_set1_epi32(4); | |
675 | |||
676 | 2 | cp1 = _mm256_set1_ps(1.0); | |
677 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
678 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
679 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
680 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
681 | |||
682 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
683 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
684 | 98298 | s = _mm256_sub_ps(aVal, | |
685 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
686 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
687 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
688 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
689 | |||
690 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); | |
691 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); | |
692 | |||
693 | 65532 | s = _mm256_div_ps( | |
694 | s, | ||
695 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
696 | 32766 | s = _mm256_mul_ps(s, s); | |
697 | // Evaluate Taylor series | ||
698 | 262128 | s = _mm256_mul_ps( | |
699 | _mm256_add_ps( | ||
700 | _mm256_mul_ps( | ||
701 | _mm256_sub_ps( | ||
702 | _mm256_mul_ps( | ||
703 | _mm256_add_ps( | ||
704 | _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), | ||
705 | s), | ||
706 | cp3), | ||
707 | s), | ||
708 | cp2), | ||
709 | s), | ||
710 | cp1), | ||
711 | s); | ||
712 | |||
713 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
714 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
715 | } | ||
716 | 32766 | s = _mm256_div_ps(s, ftwos); | |
717 | |||
718 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
719 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
720 | |||
721 | 65532 | condition1 = _mm256_cmp_ps( | |
722 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
723 | fzeroes, | ||
724 | _CMP_NEQ_UQ); | ||
725 | 98298 | condition2 = _mm256_cmp_ps( | |
726 | _mm256_cmp_ps( | ||
727 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
728 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
729 | _CMP_NEQ_UQ); | ||
730 | // Need this condition only for cos | ||
731 | // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, | ||
732 | // twos), fours)), fzeroes); | ||
733 | |||
734 | sine = | ||
735 | 98298 | _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); | |
736 | 131064 | sine = _mm256_sub_ps( | |
737 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
738 | _mm256_storeu_ps(bPtr, sine); | ||
739 | 32766 | aPtr += 8; | |
740 | 32766 | bPtr += 8; | |
741 | } | ||
742 | |||
743 | 2 | number = eighthPoints * 8; | |
744 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
745 | 14 | *bPtr++ = sin(*aPtr++); | |
746 | } | ||
747 | 2 | } | |
748 | |||
749 | #endif /* LV_HAVE_AVX2 for unaligned */ | ||
750 | |||
751 | |||
752 | #ifdef LV_HAVE_SSE4_1 | ||
753 | #include <smmintrin.h> | ||
754 | |||
755 | static inline void | ||
756 | 2 | volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
757 | { | ||
758 | 2 | float* bPtr = bVector; | |
759 | 2 | const float* aPtr = aVector; | |
760 | |||
761 | 2 | unsigned int number = 0; | |
762 | 2 | unsigned int quarterPoints = num_points / 4; | |
763 | 2 | unsigned int i = 0; | |
764 | |||
765 | __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
766 | fzeroes; | ||
767 | __m128 sine, cosine, condition1, condition2; | ||
768 | __m128i q, r, ones, twos, fours; | ||
769 | |||
770 | 2 | m4pi = _mm_set1_ps(1.273239545); | |
771 | 2 | pio4A = _mm_set1_ps(0.78515625); | |
772 | 2 | pio4B = _mm_set1_ps(0.241876e-3); | |
773 | 2 | ffours = _mm_set1_ps(4.0); | |
774 | 2 | ftwos = _mm_set1_ps(2.0); | |
775 | 2 | fones = _mm_set1_ps(1.0); | |
776 | 2 | fzeroes = _mm_setzero_ps(); | |
777 | 2 | ones = _mm_set1_epi32(1); | |
778 | 2 | twos = _mm_set1_epi32(2); | |
779 | 2 | fours = _mm_set1_epi32(4); | |
780 | |||
781 | 2 | cp1 = _mm_set1_ps(1.0); | |
782 | 2 | cp2 = _mm_set1_ps(0.83333333e-1); | |
783 | 2 | cp3 = _mm_set1_ps(0.2777778e-2); | |
784 | 2 | cp4 = _mm_set1_ps(0.49603e-4); | |
785 | 2 | cp5 = _mm_set1_ps(0.551e-6); | |
786 | |||
787 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
788 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
789 | 262136 | s = _mm_sub_ps(aVal, | |
790 | _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); | ||
791 | 131068 | q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); | |
792 | 131068 | r = _mm_add_epi32(q, _mm_and_si128(q, ones)); | |
793 | |||
794 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); | |
795 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); | |
796 | |||
797 | 131068 | s = _mm_div_ps( | |
798 | s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
799 | 65534 | s = _mm_mul_ps(s, s); | |
800 | // Evaluate Taylor series | ||
801 | 524272 | s = _mm_mul_ps( | |
802 | _mm_add_ps( | ||
803 | _mm_mul_ps( | ||
804 | _mm_sub_ps( | ||
805 | _mm_mul_ps( | ||
806 | _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), | ||
807 | cp3), | ||
808 | s), | ||
809 | cp2), | ||
810 | s), | ||
811 | cp1), | ||
812 | s); | ||
813 | |||
814 |
2/2✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
|
262136 | for (i = 0; i < 3; i++) { |
815 | 393204 | s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); | |
816 | } | ||
817 | 65534 | s = _mm_div_ps(s, ftwos); | |
818 | |||
819 | 196602 | sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); | |
820 | 65534 | cosine = _mm_sub_ps(fones, s); | |
821 | |||
822 | 262136 | condition1 = _mm_cmpneq_ps( | |
823 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); | ||
824 | 327670 | condition2 = _mm_cmpneq_ps( | |
825 | _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), | ||
826 | _mm_cmplt_ps(aVal, fzeroes)); | ||
827 | |||
828 | 196602 | sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); | |
829 | sine = | ||
830 | 262136 | _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); | |
831 | _mm_storeu_ps(bPtr, sine); | ||
832 | 65534 | aPtr += 4; | |
833 | 65534 | bPtr += 4; | |
834 | } | ||
835 | |||
836 | 2 | number = quarterPoints * 4; | |
837 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
838 | 6 | *bPtr++ = sinf(*aPtr++); | |
839 | } | ||
840 | 2 | } | |
841 | |||
842 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
843 | |||
844 | |||
845 | #ifdef LV_HAVE_GENERIC | ||
846 | |||
847 | static inline void | ||
848 | 2 | volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points) | |
849 | { | ||
850 | 2 | float* bPtr = bVector; | |
851 | 2 | const float* aPtr = aVector; | |
852 | 2 | unsigned int number = 0; | |
853 | |||
854 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
855 | 262142 | *bPtr++ = sinf(*aPtr++); | |
856 | } | ||
857 | 2 | } | |
858 | |||
859 | #endif /* LV_HAVE_GENERIC */ | ||
860 | |||
861 | |||
862 | #ifdef LV_HAVE_NEON | ||
863 | #include <arm_neon.h> | ||
864 | #include <volk/volk_neon_intrinsics.h> | ||
865 | |||
866 | static inline void | ||
867 | volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points) | ||
868 | { | ||
869 | unsigned int number = 0; | ||
870 | unsigned int quarter_points = num_points / 4; | ||
871 | float* bVectorPtr = bVector; | ||
872 | const float* aVectorPtr = aVector; | ||
873 | |||
874 | float32x4_t b_vec; | ||
875 | float32x4_t a_vec; | ||
876 | |||
877 | for (number = 0; number < quarter_points; number++) { | ||
878 | a_vec = vld1q_f32(aVectorPtr); | ||
879 | // Prefetch next one, speeds things up | ||
880 | __VOLK_PREFETCH(aVectorPtr + 4); | ||
881 | b_vec = _vsinq_f32(a_vec); | ||
882 | vst1q_f32(bVectorPtr, b_vec); | ||
883 | // move pointers ahead | ||
884 | bVectorPtr += 4; | ||
885 | aVectorPtr += 4; | ||
886 | } | ||
887 | |||
888 | // Deal with the rest | ||
889 | for (number = quarter_points * 4; number < num_points; number++) { | ||
890 | *bVectorPtr++ = sinf(*aVectorPtr++); | ||
891 | } | ||
892 | } | ||
893 | |||
894 | #endif /* LV_HAVE_NEON */ | ||
895 | |||
896 | |||
897 | #endif /* INCLUDED_volk_32f_sin_32f_u_H */ | ||
898 |