Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> | ||
5 | * | ||
6 | * This file is part of VOLK | ||
7 | * | ||
8 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
9 | */ | ||
10 | |||
11 | /*! | ||
12 | * \page volk_32f_atan_32f | ||
13 | * | ||
14 | * \b Overview | ||
15 | * | ||
16 | * Computes arctangent of input vector and stores results in output vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points) | ||
21 | * \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: The input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li bVector: The vector where results will be stored. | ||
29 | * | ||
30 | * \b Example | ||
31 | * Calculate common angles around the top half of the unit circle. | ||
32 | * \code | ||
33 | * int N = 10; | ||
34 | * unsigned int alignment = volk_get_alignment(); | ||
35 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | * | ||
38 | * in[0] = 0.f; | ||
39 | * in[1] = 1.f/std::sqrt(3.f); | ||
40 | * in[2] = 1.f; | ||
41 | * in[3] = std::sqrt(3.f); | ||
42 | * in[4] = in[5] = 1e99; | ||
43 | * for(unsigned int ii = 6; ii < N; ++ii){ | ||
44 | * in[ii] = - in[N-ii-1]; | ||
45 | * } | ||
46 | * | ||
47 | * volk_32f_atan_32f(out, in, N); | ||
48 | * | ||
49 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
50 | * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]); | ||
51 | * } | ||
52 | * | ||
53 | * volk_free(in); | ||
54 | * volk_free(out); | ||
55 | * \endcode | ||
56 | */ | ||
57 | #include <math.h> | ||
58 | |||
59 | #ifndef INCLUDED_volk_32f_atan_32f_a_H | ||
60 | #define INCLUDED_volk_32f_atan_32f_a_H | ||
61 | |||
62 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
63 | #include <immintrin.h> | ||
64 | #include <volk/volk_avx2_fma_intrinsics.h> | ||
static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    /* Aligned AVX2+FMA arctangent kernel (8 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _m256_arctan_poly_avx2_fma is evaluated.
     */
    const __m256 one = _mm256_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
95 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
96 | |||
97 | #if LV_HAVE_AVX | ||
98 | #include <immintrin.h> | ||
99 | #include <volk/volk_avx_intrinsics.h> | ||
100 | static inline void | ||
101 | 2 | volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) | |
102 | { | ||
103 | 2 | const __m256 one = _mm256_set1_ps(1.f); | |
104 | 2 | const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); | |
105 | 4 | const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
106 | 2 | const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
107 | |||
108 | 2 | unsigned int number = 0; | |
109 | 2 | unsigned int eighth_points = num_points / 8; | |
110 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighth_points; number++) { |
111 | 32766 | __m256 x = _mm256_load_ps(in); | |
112 | 32766 | __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); | |
113 | 65532 | __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), | |
114 | _mm256_blendv_ps(one, x, swap_mask)); | ||
115 | 32766 | __m256 result = _m256_arctan_poly_avx(x_star); | |
116 | 32766 | __m256 term = _mm256_and_ps(x_star, sign_mask); | |
117 | 32766 | term = _mm256_or_ps(pi_over_2, term); | |
118 | 32766 | term = _mm256_sub_ps(term, result); | |
119 | 32766 | result = _mm256_blendv_ps(result, term, swap_mask); | |
120 | _mm256_store_ps(out, result); | ||
121 | 32766 | in += 8; | |
122 | 32766 | out += 8; | |
123 | } | ||
124 | |||
125 | 2 | number = eighth_points * 8; | |
126 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
127 | 14 | *out++ = volk_arctan(*in++); | |
128 | } | ||
129 | 2 | } | |
130 | #endif /* LV_HAVE_AVX for aligned */ | ||
131 | |||
132 | #ifdef LV_HAVE_SSE4_1 | ||
133 | #include <smmintrin.h> | ||
134 | #include <volk/volk_sse_intrinsics.h> | ||
135 | static inline void | ||
136 | 2 | volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) | |
137 | { | ||
138 | 2 | const __m128 one = _mm_set1_ps(1.f); | |
139 | 2 | const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f); | |
140 | 4 | const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); | |
141 | 2 | const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); | |
142 | |||
143 | 2 | unsigned int number = 0; | |
144 | 2 | unsigned int quarter_points = num_points / 4; | |
145 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarter_points; number++) { |
146 | 65534 | __m128 x = _mm_load_ps(in); | |
147 | 131068 | __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); | |
148 | 131068 | __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), | |
149 | _mm_blendv_ps(one, x, swap_mask)); | ||
150 | 65534 | __m128 result = _mm_arctan_poly_sse(x_star); | |
151 | 65534 | __m128 term = _mm_and_ps(x_star, sign_mask); | |
152 | 65534 | term = _mm_or_ps(pi_over_2, term); | |
153 | 65534 | term = _mm_sub_ps(term, result); | |
154 | 65534 | result = _mm_blendv_ps(result, term, swap_mask); | |
155 | _mm_store_ps(out, result); | ||
156 | 65534 | in += 4; | |
157 | 65534 | out += 4; | |
158 | } | ||
159 | |||
160 | 2 | number = quarter_points * 4; | |
161 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
162 | 6 | *out++ = volk_arctan(*in++); | |
163 | } | ||
164 | 2 | } | |
165 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
166 | #endif /* INCLUDED_volk_32f_atan_32f_a_H */ | ||
167 | |||
168 | #ifndef INCLUDED_volk_32f_atan_32f_u_H | ||
169 | #define INCLUDED_volk_32f_atan_32f_u_H | ||
170 | |||
171 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
172 | #include <immintrin.h> | ||
173 | static inline void | ||
174 | 2 | volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points) | |
175 | { | ||
176 | 2 | const __m256 one = _mm256_set1_ps(1.f); | |
177 | 2 | const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); | |
178 | 4 | const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
179 | 2 | const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
180 | |||
181 | 2 | unsigned int number = 0; | |
182 | 2 | unsigned int eighth_points = num_points / 8; | |
183 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighth_points; number++) { |
184 | 32766 | __m256 x = _mm256_loadu_ps(in); | |
185 | 32766 | __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); | |
186 | 65532 | __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), | |
187 | _mm256_blendv_ps(one, x, swap_mask)); | ||
188 | 32766 | __m256 result = _m256_arctan_poly_avx2_fma(x_star); | |
189 | 32766 | __m256 term = _mm256_and_ps(x_star, sign_mask); | |
190 | 32766 | term = _mm256_or_ps(pi_over_2, term); | |
191 | 32766 | term = _mm256_sub_ps(term, result); | |
192 | 32766 | result = _mm256_blendv_ps(result, term, swap_mask); | |
193 | _mm256_storeu_ps(out, result); | ||
194 | 32766 | in += 8; | |
195 | 32766 | out += 8; | |
196 | } | ||
197 | |||
198 | 2 | number = eighth_points * 8; | |
199 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
200 | 14 | *out++ = volk_arctan(*in++); | |
201 | } | ||
202 | 2 | } | |
203 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
204 | |||
205 | #if LV_HAVE_AVX | ||
206 | #include <immintrin.h> | ||
static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
    /* Unaligned AVX arctangent kernel (8 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _m256_arctan_poly_avx is evaluated.
     */
    const __m256 one = _mm256_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
237 | #endif /* LV_HAVE_AVX for unaligned */ | ||
238 | |||
239 | #ifdef LV_HAVE_SSE4_1 | ||
240 | #include <smmintrin.h> | ||
241 | #include <volk/volk_sse_intrinsics.h> | ||
static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
    /* Unaligned SSE4.1 arctangent kernel (4 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _mm_arctan_poly_sse is evaluated.
     */
    const __m128 one = _mm_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 x = _mm_loadu_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                   _mm_blendv_ps(one, x, swap_mask));
        __m128 result = _mm_arctan_poly_sse(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_storeu_ps(out, result);
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining num_points % 4 elements. */
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
272 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
273 | |||
274 | #ifdef LV_HAVE_GENERIC | ||
275 | static inline void | ||
276 | 2 | volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) | |
277 | { | ||
278 | 2 | unsigned int number = 0; | |
279 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
280 | 262142 | *out++ = volk_arctan(*in++); | |
281 | } | ||
282 | 2 | } | |
283 | #endif /* LV_HAVE_GENERIC */ | ||
284 | |||
285 | #ifdef LV_HAVE_GENERIC | ||
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
    /* Reference implementation: libm atanf applied element-wise. */
    unsigned int idx;
    for (idx = 0; idx < num_points; ++idx) {
        out[idx] = atanf(in[idx]);
    }
}
294 | #endif /* LV_HAVE_GENERIC */ | ||
295 | |||
296 | #endif /* INCLUDED_volk_32f_atan_32f_u_H */ | ||
297 |