Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_tan_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the tangent of each element of the aVector. | ||
16 | * | ||
17 | * b[i] = tan(a[i]) | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points) | ||
22 | * \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li aVector: The buffer of points. | ||
26 | * \li num_points: The number of values in input buffer. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li bVector: The output buffer. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Calculate tan(theta) for common angles. | ||
33 | * \code | ||
34 | * int N = 10; | ||
35 | * unsigned int alignment = volk_get_alignment(); | ||
36 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
38 | * | ||
39 | * in[0] = 0.000; | ||
40 | * in[1] = 0.524; | ||
41 | * in[2] = 0.785; | ||
42 | * in[3] = 1.047; | ||
43 | * in[4] = 1.571 ; | ||
44 | * in[5] = 1.571 ; | ||
45 | * in[6] = -1.047; | ||
46 | * in[7] = -0.785; | ||
47 | * in[8] = -0.524; | ||
48 | * in[9] = -0.000; | ||
49 | * | ||
50 | * volk_32f_tan_32f(out, in, N); | ||
51 | * | ||
52 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
53 | * printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]); | ||
54 | * } | ||
55 | * | ||
56 | * volk_free(in); | ||
57 | * volk_free(out); | ||
58 | * \endcode | ||
59 | */ | ||
60 | |||
61 | #include <inttypes.h> | ||
62 | #include <math.h> | ||
63 | #include <stdio.h> | ||
64 | |||
65 | #ifndef INCLUDED_volk_32f_tan_32f_a_H | ||
66 | #define INCLUDED_volk_32f_tan_32f_a_H | ||
67 | |||
68 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
69 | #include <immintrin.h> | ||
70 | |||
71 | static inline void | ||
72 | 2 | volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) | |
73 | { | ||
74 | 2 | float* bPtr = bVector; | |
75 | 2 | const float* aPtr = aVector; | |
76 | |||
77 | 2 | unsigned int number = 0; | |
78 | 2 | unsigned int eighthPoints = num_points / 8; | |
79 | 2 | unsigned int i = 0; | |
80 | |||
81 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
82 | fzeroes; | ||
83 | __m256 sine, cosine, tangent, condition1, condition2, condition3; | ||
84 | __m256i q, r, ones, twos, fours; | ||
85 | |||
86 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
87 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
88 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
89 | 2 | ffours = _mm256_set1_ps(4.0); | |
90 | 2 | ftwos = _mm256_set1_ps(2.0); | |
91 | 2 | fones = _mm256_set1_ps(1.0); | |
92 | 2 | fzeroes = _mm256_setzero_ps(); | |
93 | 2 | ones = _mm256_set1_epi32(1); | |
94 | 2 | twos = _mm256_set1_epi32(2); | |
95 | 2 | fours = _mm256_set1_epi32(4); | |
96 | |||
97 | 2 | cp1 = _mm256_set1_ps(1.0); | |
98 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
99 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
100 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
101 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
102 | |||
103 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
104 | 32766 | aVal = _mm256_load_ps(aPtr); | |
105 | 98298 | s = _mm256_sub_ps(aVal, | |
106 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
107 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
108 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
109 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
110 | |||
111 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); | |
112 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); | |
113 | |||
114 | 65532 | s = _mm256_div_ps( | |
115 | s, | ||
116 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
117 | 32766 | s = _mm256_mul_ps(s, s); | |
118 | // Evaluate Taylor series | ||
119 | 131064 | s = _mm256_mul_ps( | |
120 | _mm256_fmadd_ps( | ||
121 | _mm256_fmsub_ps( | ||
122 | _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
123 | s, | ||
124 | cp1), | ||
125 | s); | ||
126 | |||
127 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
128 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
129 | } | ||
130 | 32766 | s = _mm256_div_ps(s, ftwos); | |
131 | |||
132 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
133 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
134 | |||
135 | 65532 | condition1 = _mm256_cmp_ps( | |
136 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
137 | fzeroes, | ||
138 | _CMP_NEQ_UQ); | ||
139 | 98298 | condition2 = _mm256_cmp_ps( | |
140 | _mm256_cmp_ps( | ||
141 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
142 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
143 | _CMP_NEQ_UQ); | ||
144 | 65532 | condition3 = _mm256_cmp_ps( | |
145 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), | ||
146 | fzeroes, | ||
147 | _CMP_NEQ_UQ); | ||
148 | |||
149 | 32766 | __m256 temp = cosine; | |
150 | cosine = | ||
151 | 98298 | _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); | |
152 | 98298 | sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); | |
153 | 131064 | sine = _mm256_sub_ps( | |
154 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
155 | 131064 | cosine = _mm256_sub_ps( | |
156 | cosine, | ||
157 | _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); | ||
158 | 32766 | tangent = _mm256_div_ps(sine, cosine); | |
159 | _mm256_store_ps(bPtr, tangent); | ||
160 | 32766 | aPtr += 8; | |
161 | 32766 | bPtr += 8; | |
162 | } | ||
163 | |||
164 | 2 | number = eighthPoints * 8; | |
165 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
166 | 14 | *bPtr++ = tan(*aPtr++); | |
167 | } | ||
168 | 2 | } | |
169 | |||
170 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
171 | |||
172 | #ifdef LV_HAVE_AVX2 | ||
173 | #include <immintrin.h> | ||
174 | |||
175 | static inline void | ||
176 | 2 | volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
177 | { | ||
178 | 2 | float* bPtr = bVector; | |
179 | 2 | const float* aPtr = aVector; | |
180 | |||
181 | 2 | unsigned int number = 0; | |
182 | 2 | unsigned int eighthPoints = num_points / 8; | |
183 | 2 | unsigned int i = 0; | |
184 | |||
185 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
186 | fzeroes; | ||
187 | __m256 sine, cosine, tangent, condition1, condition2, condition3; | ||
188 | __m256i q, r, ones, twos, fours; | ||
189 | |||
190 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
191 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
192 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
193 | 2 | ffours = _mm256_set1_ps(4.0); | |
194 | 2 | ftwos = _mm256_set1_ps(2.0); | |
195 | 2 | fones = _mm256_set1_ps(1.0); | |
196 | 2 | fzeroes = _mm256_setzero_ps(); | |
197 | 2 | ones = _mm256_set1_epi32(1); | |
198 | 2 | twos = _mm256_set1_epi32(2); | |
199 | 2 | fours = _mm256_set1_epi32(4); | |
200 | |||
201 | 2 | cp1 = _mm256_set1_ps(1.0); | |
202 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
203 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
204 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
205 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
206 | |||
207 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
208 | 32766 | aVal = _mm256_load_ps(aPtr); | |
209 | 98298 | s = _mm256_sub_ps(aVal, | |
210 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
211 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
212 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
213 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
214 | |||
215 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); | |
216 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); | |
217 | |||
218 | 65532 | s = _mm256_div_ps( | |
219 | s, | ||
220 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
221 | 32766 | s = _mm256_mul_ps(s, s); | |
222 | // Evaluate Taylor series | ||
223 | 262128 | s = _mm256_mul_ps( | |
224 | _mm256_add_ps( | ||
225 | _mm256_mul_ps( | ||
226 | _mm256_sub_ps( | ||
227 | _mm256_mul_ps( | ||
228 | _mm256_add_ps( | ||
229 | _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), | ||
230 | s), | ||
231 | cp3), | ||
232 | s), | ||
233 | cp2), | ||
234 | s), | ||
235 | cp1), | ||
236 | s); | ||
237 | |||
238 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
239 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
240 | } | ||
241 | 32766 | s = _mm256_div_ps(s, ftwos); | |
242 | |||
243 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
244 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
245 | |||
246 | 65532 | condition1 = _mm256_cmp_ps( | |
247 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
248 | fzeroes, | ||
249 | _CMP_NEQ_UQ); | ||
250 | 98298 | condition2 = _mm256_cmp_ps( | |
251 | _mm256_cmp_ps( | ||
252 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
253 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
254 | _CMP_NEQ_UQ); | ||
255 | 65532 | condition3 = _mm256_cmp_ps( | |
256 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), | ||
257 | fzeroes, | ||
258 | _CMP_NEQ_UQ); | ||
259 | |||
260 | 32766 | __m256 temp = cosine; | |
261 | cosine = | ||
262 | 98298 | _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); | |
263 | 98298 | sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); | |
264 | 131064 | sine = _mm256_sub_ps( | |
265 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
266 | 131064 | cosine = _mm256_sub_ps( | |
267 | cosine, | ||
268 | _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); | ||
269 | 32766 | tangent = _mm256_div_ps(sine, cosine); | |
270 | _mm256_store_ps(bPtr, tangent); | ||
271 | 32766 | aPtr += 8; | |
272 | 32766 | bPtr += 8; | |
273 | } | ||
274 | |||
275 | 2 | number = eighthPoints * 8; | |
276 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
277 | 14 | *bPtr++ = tan(*aPtr++); | |
278 | } | ||
279 | 2 | } | |
280 | |||
281 | #endif /* LV_HAVE_AVX2 for aligned */ | ||
282 | |||
283 | #ifdef LV_HAVE_SSE4_1 | ||
284 | #include <smmintrin.h> | ||
285 | |||
286 | static inline void | ||
287 | 2 | volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
288 | { | ||
289 | 2 | float* bPtr = bVector; | |
290 | 2 | const float* aPtr = aVector; | |
291 | |||
292 | 2 | unsigned int number = 0; | |
293 | 2 | unsigned int quarterPoints = num_points / 4; | |
294 | 2 | unsigned int i = 0; | |
295 | |||
296 | __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
297 | fzeroes; | ||
298 | __m128 sine, cosine, tangent, condition1, condition2, condition3; | ||
299 | __m128i q, r, ones, twos, fours; | ||
300 | |||
301 | 2 | m4pi = _mm_set1_ps(1.273239545); | |
302 | 2 | pio4A = _mm_set1_ps(0.78515625); | |
303 | 2 | pio4B = _mm_set1_ps(0.241876e-3); | |
304 | 2 | ffours = _mm_set1_ps(4.0); | |
305 | 2 | ftwos = _mm_set1_ps(2.0); | |
306 | 2 | fones = _mm_set1_ps(1.0); | |
307 | 2 | fzeroes = _mm_setzero_ps(); | |
308 | 2 | ones = _mm_set1_epi32(1); | |
309 | 2 | twos = _mm_set1_epi32(2); | |
310 | 2 | fours = _mm_set1_epi32(4); | |
311 | |||
312 | 2 | cp1 = _mm_set1_ps(1.0); | |
313 | 2 | cp2 = _mm_set1_ps(0.83333333e-1); | |
314 | 2 | cp3 = _mm_set1_ps(0.2777778e-2); | |
315 | 2 | cp4 = _mm_set1_ps(0.49603e-4); | |
316 | 2 | cp5 = _mm_set1_ps(0.551e-6); | |
317 | |||
318 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
319 | 65534 | aVal = _mm_load_ps(aPtr); | |
320 | 262136 | s = _mm_sub_ps(aVal, | |
321 | _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); | ||
322 | 131068 | q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); | |
323 | 131068 | r = _mm_add_epi32(q, _mm_and_si128(q, ones)); | |
324 | |||
325 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); | |
326 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); | |
327 | |||
328 | 131068 | s = _mm_div_ps( | |
329 | s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
330 | 65534 | s = _mm_mul_ps(s, s); | |
331 | // Evaluate Taylor series | ||
332 | 524272 | s = _mm_mul_ps( | |
333 | _mm_add_ps( | ||
334 | _mm_mul_ps( | ||
335 | _mm_sub_ps( | ||
336 | _mm_mul_ps( | ||
337 | _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), | ||
338 | cp3), | ||
339 | s), | ||
340 | cp2), | ||
341 | s), | ||
342 | cp1), | ||
343 | s); | ||
344 | |||
345 |
2/2✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
|
262136 | for (i = 0; i < 3; i++) { |
346 | 393204 | s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); | |
347 | } | ||
348 | 65534 | s = _mm_div_ps(s, ftwos); | |
349 | |||
350 | 196602 | sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); | |
351 | 65534 | cosine = _mm_sub_ps(fones, s); | |
352 | |||
353 | 262136 | condition1 = _mm_cmpneq_ps( | |
354 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); | ||
355 | 327670 | condition2 = _mm_cmpneq_ps( | |
356 | _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), | ||
357 | _mm_cmplt_ps(aVal, fzeroes)); | ||
358 | 196602 | condition3 = _mm_cmpneq_ps( | |
359 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); | ||
360 | |||
361 | 65534 | __m128 temp = cosine; | |
362 | 196602 | cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); | |
363 | 196602 | sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); | |
364 | sine = | ||
365 | 262136 | _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); | |
366 | 262136 | cosine = _mm_sub_ps( | |
367 | cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); | ||
368 | 65534 | tangent = _mm_div_ps(sine, cosine); | |
369 | _mm_store_ps(bPtr, tangent); | ||
370 | 65534 | aPtr += 4; | |
371 | 65534 | bPtr += 4; | |
372 | } | ||
373 | |||
374 | 2 | number = quarterPoints * 4; | |
375 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
376 | 6 | *bPtr++ = tanf(*aPtr++); | |
377 | } | ||
378 | 2 | } | |
379 | |||
380 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
381 | |||
382 | |||
383 | #endif /* INCLUDED_volk_32f_tan_32f_a_H */ | ||
384 | |||
385 | #ifndef INCLUDED_volk_32f_tan_32f_u_H | ||
386 | #define INCLUDED_volk_32f_tan_32f_u_H | ||
387 | |||
388 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
389 | #include <immintrin.h> | ||
390 | |||
391 | static inline void | ||
392 | 2 | volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) | |
393 | { | ||
394 | 2 | float* bPtr = bVector; | |
395 | 2 | const float* aPtr = aVector; | |
396 | |||
397 | 2 | unsigned int number = 0; | |
398 | 2 | unsigned int eighthPoints = num_points / 8; | |
399 | 2 | unsigned int i = 0; | |
400 | |||
401 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
402 | fzeroes; | ||
403 | __m256 sine, cosine, tangent, condition1, condition2, condition3; | ||
404 | __m256i q, r, ones, twos, fours; | ||
405 | |||
406 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
407 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
408 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
409 | 2 | ffours = _mm256_set1_ps(4.0); | |
410 | 2 | ftwos = _mm256_set1_ps(2.0); | |
411 | 2 | fones = _mm256_set1_ps(1.0); | |
412 | 2 | fzeroes = _mm256_setzero_ps(); | |
413 | 2 | ones = _mm256_set1_epi32(1); | |
414 | 2 | twos = _mm256_set1_epi32(2); | |
415 | 2 | fours = _mm256_set1_epi32(4); | |
416 | |||
417 | 2 | cp1 = _mm256_set1_ps(1.0); | |
418 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
419 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
420 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
421 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
422 | |||
423 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
424 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
425 | 98298 | s = _mm256_sub_ps(aVal, | |
426 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
427 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
428 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
429 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
430 | |||
431 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); | |
432 | 65532 | s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); | |
433 | |||
434 | 65532 | s = _mm256_div_ps( | |
435 | s, | ||
436 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
437 | 32766 | s = _mm256_mul_ps(s, s); | |
438 | // Evaluate Taylor series | ||
439 | 131064 | s = _mm256_mul_ps( | |
440 | _mm256_fmadd_ps( | ||
441 | _mm256_fmsub_ps( | ||
442 | _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), | ||
443 | s, | ||
444 | cp1), | ||
445 | s); | ||
446 | |||
447 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
448 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
449 | } | ||
450 | 32766 | s = _mm256_div_ps(s, ftwos); | |
451 | |||
452 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
453 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
454 | |||
455 | 65532 | condition1 = _mm256_cmp_ps( | |
456 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
457 | fzeroes, | ||
458 | _CMP_NEQ_UQ); | ||
459 | 98298 | condition2 = _mm256_cmp_ps( | |
460 | _mm256_cmp_ps( | ||
461 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
462 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
463 | _CMP_NEQ_UQ); | ||
464 | 65532 | condition3 = _mm256_cmp_ps( | |
465 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), | ||
466 | fzeroes, | ||
467 | _CMP_NEQ_UQ); | ||
468 | |||
469 | 32766 | __m256 temp = cosine; | |
470 | cosine = | ||
471 | 98298 | _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); | |
472 | 98298 | sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); | |
473 | 131064 | sine = _mm256_sub_ps( | |
474 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
475 | 131064 | cosine = _mm256_sub_ps( | |
476 | cosine, | ||
477 | _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); | ||
478 | 32766 | tangent = _mm256_div_ps(sine, cosine); | |
479 | _mm256_storeu_ps(bPtr, tangent); | ||
480 | 32766 | aPtr += 8; | |
481 | 32766 | bPtr += 8; | |
482 | } | ||
483 | |||
484 | 2 | number = eighthPoints * 8; | |
485 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
486 | 14 | *bPtr++ = tan(*aPtr++); | |
487 | } | ||
488 | 2 | } | |
489 | |||
490 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
491 | |||
492 | #ifdef LV_HAVE_AVX2 | ||
493 | #include <immintrin.h> | ||
494 | |||
495 | static inline void | ||
496 | 2 | volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
497 | { | ||
498 | 2 | float* bPtr = bVector; | |
499 | 2 | const float* aPtr = aVector; | |
500 | |||
501 | 2 | unsigned int number = 0; | |
502 | 2 | unsigned int eighthPoints = num_points / 8; | |
503 | 2 | unsigned int i = 0; | |
504 | |||
505 | __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
506 | fzeroes; | ||
507 | __m256 sine, cosine, tangent, condition1, condition2, condition3; | ||
508 | __m256i q, r, ones, twos, fours; | ||
509 | |||
510 | 2 | m4pi = _mm256_set1_ps(1.273239545); | |
511 | 2 | pio4A = _mm256_set1_ps(0.78515625); | |
512 | 2 | pio4B = _mm256_set1_ps(0.241876e-3); | |
513 | 2 | ffours = _mm256_set1_ps(4.0); | |
514 | 2 | ftwos = _mm256_set1_ps(2.0); | |
515 | 2 | fones = _mm256_set1_ps(1.0); | |
516 | 2 | fzeroes = _mm256_setzero_ps(); | |
517 | 2 | ones = _mm256_set1_epi32(1); | |
518 | 2 | twos = _mm256_set1_epi32(2); | |
519 | 2 | fours = _mm256_set1_epi32(4); | |
520 | |||
521 | 2 | cp1 = _mm256_set1_ps(1.0); | |
522 | 2 | cp2 = _mm256_set1_ps(0.83333333e-1); | |
523 | 2 | cp3 = _mm256_set1_ps(0.2777778e-2); | |
524 | 2 | cp4 = _mm256_set1_ps(0.49603e-4); | |
525 | 2 | cp5 = _mm256_set1_ps(0.551e-6); | |
526 | |||
527 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
528 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
529 | 98298 | s = _mm256_sub_ps(aVal, | |
530 | _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), | ||
531 | 32766 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); | |
532 | 65532 | q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); | |
533 | 65532 | r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); | |
534 | |||
535 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); | |
536 | 98298 | s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); | |
537 | |||
538 | 65532 | s = _mm256_div_ps( | |
539 | s, | ||
540 | _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
541 | 32766 | s = _mm256_mul_ps(s, s); | |
542 | // Evaluate Taylor series | ||
543 | 262128 | s = _mm256_mul_ps( | |
544 | _mm256_add_ps( | ||
545 | _mm256_mul_ps( | ||
546 | _mm256_sub_ps( | ||
547 | _mm256_mul_ps( | ||
548 | _mm256_add_ps( | ||
549 | _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), | ||
550 | s), | ||
551 | cp3), | ||
552 | s), | ||
553 | cp2), | ||
554 | s), | ||
555 | cp1), | ||
556 | s); | ||
557 | |||
558 |
2/2✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 32766 times.
|
131064 | for (i = 0; i < 3; i++) { |
559 | 196596 | s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); | |
560 | } | ||
561 | 32766 | s = _mm256_div_ps(s, ftwos); | |
562 | |||
563 | 98298 | sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); | |
564 | 32766 | cosine = _mm256_sub_ps(fones, s); | |
565 | |||
566 | 65532 | condition1 = _mm256_cmp_ps( | |
567 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), | ||
568 | fzeroes, | ||
569 | _CMP_NEQ_UQ); | ||
570 | 98298 | condition2 = _mm256_cmp_ps( | |
571 | _mm256_cmp_ps( | ||
572 | _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), | ||
573 | _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), | ||
574 | _CMP_NEQ_UQ); | ||
575 | 65532 | condition3 = _mm256_cmp_ps( | |
576 | _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), | ||
577 | fzeroes, | ||
578 | _CMP_NEQ_UQ); | ||
579 | |||
580 | 32766 | __m256 temp = cosine; | |
581 | cosine = | ||
582 | 98298 | _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); | |
583 | 98298 | sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); | |
584 | 131064 | sine = _mm256_sub_ps( | |
585 | sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); | ||
586 | 131064 | cosine = _mm256_sub_ps( | |
587 | cosine, | ||
588 | _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); | ||
589 | 32766 | tangent = _mm256_div_ps(sine, cosine); | |
590 | _mm256_storeu_ps(bPtr, tangent); | ||
591 | 32766 | aPtr += 8; | |
592 | 32766 | bPtr += 8; | |
593 | } | ||
594 | |||
595 | 2 | number = eighthPoints * 8; | |
596 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
597 | 14 | *bPtr++ = tan(*aPtr++); | |
598 | } | ||
599 | 2 | } | |
600 | |||
601 | #endif /* LV_HAVE_AVX2 for unaligned */ | ||
602 | |||
603 | |||
604 | #ifdef LV_HAVE_SSE4_1 | ||
605 | #include <smmintrin.h> | ||
606 | |||
607 | static inline void | ||
608 | 2 | volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
609 | { | ||
610 | 2 | float* bPtr = bVector; | |
611 | 2 | const float* aPtr = aVector; | |
612 | |||
613 | 2 | unsigned int number = 0; | |
614 | 2 | unsigned int quarterPoints = num_points / 4; | |
615 | 2 | unsigned int i = 0; | |
616 | |||
617 | __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, | ||
618 | fzeroes; | ||
619 | __m128 sine, cosine, tangent, condition1, condition2, condition3; | ||
620 | __m128i q, r, ones, twos, fours; | ||
621 | |||
622 | 2 | m4pi = _mm_set1_ps(1.273239545); | |
623 | 2 | pio4A = _mm_set1_ps(0.78515625); | |
624 | 2 | pio4B = _mm_set1_ps(0.241876e-3); | |
625 | 2 | ffours = _mm_set1_ps(4.0); | |
626 | 2 | ftwos = _mm_set1_ps(2.0); | |
627 | 2 | fones = _mm_set1_ps(1.0); | |
628 | 2 | fzeroes = _mm_setzero_ps(); | |
629 | 2 | ones = _mm_set1_epi32(1); | |
630 | 2 | twos = _mm_set1_epi32(2); | |
631 | 2 | fours = _mm_set1_epi32(4); | |
632 | |||
633 | 2 | cp1 = _mm_set1_ps(1.0); | |
634 | 2 | cp2 = _mm_set1_ps(0.83333333e-1); | |
635 | 2 | cp3 = _mm_set1_ps(0.2777778e-2); | |
636 | 2 | cp4 = _mm_set1_ps(0.49603e-4); | |
637 | 2 | cp5 = _mm_set1_ps(0.551e-6); | |
638 | |||
639 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
640 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
641 | 262136 | s = _mm_sub_ps(aVal, | |
642 | _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); | ||
643 | 131068 | q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); | |
644 | 131068 | r = _mm_add_epi32(q, _mm_and_si128(q, ones)); | |
645 | |||
646 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); | |
647 | 196602 | s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); | |
648 | |||
649 | 131068 | s = _mm_div_ps( | |
650 | s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction | ||
651 | 65534 | s = _mm_mul_ps(s, s); | |
652 | // Evaluate Taylor series | ||
653 | 524272 | s = _mm_mul_ps( | |
654 | _mm_add_ps( | ||
655 | _mm_mul_ps( | ||
656 | _mm_sub_ps( | ||
657 | _mm_mul_ps( | ||
658 | _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), | ||
659 | cp3), | ||
660 | s), | ||
661 | cp2), | ||
662 | s), | ||
663 | cp1), | ||
664 | s); | ||
665 | |||
666 |
2/2✓ Branch 0 taken 196602 times.
✓ Branch 1 taken 65534 times.
|
262136 | for (i = 0; i < 3; i++) { |
667 | 393204 | s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); | |
668 | } | ||
669 | 65534 | s = _mm_div_ps(s, ftwos); | |
670 | |||
671 | 196602 | sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); | |
672 | 65534 | cosine = _mm_sub_ps(fones, s); | |
673 | |||
674 | 262136 | condition1 = _mm_cmpneq_ps( | |
675 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); | ||
676 | 327670 | condition2 = _mm_cmpneq_ps( | |
677 | _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), | ||
678 | _mm_cmplt_ps(aVal, fzeroes)); | ||
679 | 196602 | condition3 = _mm_cmpneq_ps( | |
680 | _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); | ||
681 | |||
682 | 65534 | __m128 temp = cosine; | |
683 | 196602 | cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); | |
684 | 196602 | sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); | |
685 | sine = | ||
686 | 262136 | _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); | |
687 | 262136 | cosine = _mm_sub_ps( | |
688 | cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); | ||
689 | 65534 | tangent = _mm_div_ps(sine, cosine); | |
690 | _mm_storeu_ps(bPtr, tangent); | ||
691 | 65534 | aPtr += 4; | |
692 | 65534 | bPtr += 4; | |
693 | } | ||
694 | |||
695 | 2 | number = quarterPoints * 4; | |
696 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
697 | 6 | *bPtr++ = tanf(*aPtr++); | |
698 | } | ||
699 | 2 | } | |
700 | |||
701 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
702 | |||
703 | |||
704 | #ifdef LV_HAVE_GENERIC | ||
705 | |||
706 | static inline void | ||
707 | 2 | volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) | |
708 | { | ||
709 | 2 | float* bPtr = bVector; | |
710 | 2 | const float* aPtr = aVector; | |
711 | 2 | unsigned int number = 0; | |
712 | |||
713 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
714 | 262142 | *bPtr++ = tanf(*aPtr++); | |
715 | } | ||
716 | 2 | } | |
717 | #endif /* LV_HAVE_GENERIC */ | ||
718 | |||
719 | |||
720 | #ifdef LV_HAVE_NEON | ||
721 | #include <arm_neon.h> | ||
722 | #include <volk/volk_neon_intrinsics.h> | ||
723 | |||
724 | static inline void | ||
725 | volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points) | ||
726 | { | ||
727 | unsigned int number = 0; | ||
728 | unsigned int quarter_points = num_points / 4; | ||
729 | float* bVectorPtr = bVector; | ||
730 | const float* aVectorPtr = aVector; | ||
731 | |||
732 | float32x4_t b_vec; | ||
733 | float32x4_t a_vec; | ||
734 | |||
735 | for (number = 0; number < quarter_points; number++) { | ||
736 | a_vec = vld1q_f32(aVectorPtr); | ||
737 | // Prefetch next one, speeds things up | ||
738 | __VOLK_PREFETCH(aVectorPtr + 4); | ||
739 | b_vec = _vtanq_f32(a_vec); | ||
740 | vst1q_f32(bVectorPtr, b_vec); | ||
741 | // move pointers ahead | ||
742 | bVectorPtr += 4; | ||
743 | aVectorPtr += 4; | ||
744 | } | ||
745 | |||
746 | // Deal with the rest | ||
747 | for (number = quarter_points * 4; number < num_points; number++) { | ||
748 | *bVectorPtr++ = tanf(*aVectorPtr++); | ||
749 | } | ||
750 | } | ||
751 | #endif /* LV_HAVE_NEON */ | ||
752 | |||
753 | |||
754 | #endif /* INCLUDED_volk_32f_tan_32f_u_H */ | ||
755 |