GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_atan_32f.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       142     142     100.0%
Functions:     8       8     100.0%
Branches:     28      28     100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11 /*!
12 * \page volk_32f_atan_32f
13 *
14 * \b Overview
15 *
16 * Computes the arctangent of the input vector and stores the results in the output vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
21 * \endcode
22 *
23 * \b Inputs
24 * \li aVector: The input vector of floats.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li bVector: The vector where results will be stored.
29 *
30 * \b Example
31 * Calculate common angles around the right half of the unit circle.
32 * \code
33 * int N = 10;
34 * unsigned int alignment = volk_get_alignment();
35 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
36 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
37 *
38 * in[0] = 0.f;
39 * in[1] = 1.f/std::sqrt(3.f);
40 * in[2] = 1.f;
41 * in[3] = std::sqrt(3.f);
42 * in[4] = in[5] = 1e99;
43 * for(unsigned int ii = 6; ii < N; ++ii){
44 * in[ii] = - in[N-ii-1];
45 * }
46 *
47 * volk_32f_atan_32f(out, in, N);
48 *
49 * for(unsigned int ii = 0; ii < N; ++ii){
50 * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]);
51 * }
52 *
53 * volk_free(in);
54 * volk_free(out);
55 * \endcode
56 */
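For reference, the example above exercises the identities atan(0) = 0, atan(1/sqrt(3)) = pi/6, atan(1) = pi/4, atan(sqrt(3)) = pi/3, and, because 1e99 overflows to +inf in single precision, atan(+inf) = pi/2; the mirrored negative inputs in the second half of the buffer yield the negated angles. Approximate expected output:

/*
 * atan(0.000) = 0.000           atan(-1.732) = -1.047
 * atan(0.577) = 0.524  (pi/6)   atan(-1.000) = -0.785
 * atan(1.000) = 0.785  (pi/4)   atan(-0.577) = -0.524
 * atan(1.732) = 1.047  (pi/3)   atan(-0.000) = -0.000
 * atan(inf)   = 1.571  (pi/2), printed twice (in[4] and in[5])
 */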
57 #include <math.h>
58
59 #ifndef INCLUDED_volk_32f_atan_32f_a_H
60 #define INCLUDED_volk_32f_atan_32f_a_H
61
62 #if LV_HAVE_AVX2 && LV_HAVE_FMA
63 #include <immintrin.h>
64 #include <volk/volk_avx2_fma_intrinsics.h>
65 static inline void
66 2 volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
67 {
68 2 const __m256 one = _mm256_set1_ps(1.f);
69 2 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
70 4 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
71 2 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
72
73 2 unsigned int number = 0;
74 2 unsigned int eighth_points = num_points / 8;
75
2/2 32768 for (; number < eighth_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
76 32766 __m256 x = _mm256_load_ps(in);
77 32766 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
78 65532 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
79 _mm256_blendv_ps(one, x, swap_mask));
80 32766 __m256 result = _m256_arctan_poly_avx2_fma(x_star);
81 32766 __m256 term = _mm256_and_ps(x_star, sign_mask);
82 32766 term = _mm256_or_ps(pi_over_2, term);
83 32766 term = _mm256_sub_ps(term, result);
84 32766 result = _mm256_blendv_ps(result, term, swap_mask);
85 _mm256_store_ps(out, result);
86 32766 in += 8;
87 32766 out += 8;
88 }
89
90 2 number = eighth_points * 8;
91
2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
92 14 *out++ = volk_arctan(*in++);
93 }
94 2 }
95 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
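All the SIMD kernels in this file share one range reduction: the polynomial helper is only accurate on [-1, 1], so inputs with |x| > 1 are folded back through atan(x) = copysign(pi/2, x) - atan(1/x). The constant 0x1.921fb6p0f is pi/2 written as a C99 hexadecimal float, rounded to single precision. A minimal scalar sketch of that flow, using an illustrative truncated Taylor polynomial rather than VOLK's actual fitted coefficients:

#include <math.h>

/* Scalar sketch of the kernels' range reduction (illustrative only). */
static float atan_sketch(float x)
{
    const float pi_over_2 = 0x1.921fb6p0f;  /* pi/2 as a hex float     */
    int swap = fabsf(x) > 1.f;              /* mirrors swap_mask       */
    float x_star = swap ? 1.f / x : x;      /* mirrors the blended div */
    float x2 = x_star * x_star;
    /* Truncated Taylor series x - x^3/3 + x^5/5; the real kernels use a
     * higher-order fit to reach full single-precision accuracy.       */
    float poly = x_star * (1.f + x2 * (-1.f / 3.f + x2 * (1.f / 5.f)));
    /* copysign(pi/2, x_star) - poly, matching the sign_mask/or/sub steps */
    return swap ? copysignf(pi_over_2, x_star) - poly : poly;
}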
96
97 #if LV_HAVE_AVX
98 #include <immintrin.h>
99 #include <volk/volk_avx_intrinsics.h>
100 static inline void
101 2 volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
102 {
103 2 const __m256 one = _mm256_set1_ps(1.f);
104 2 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
105 4 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
106 2 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
107
108 2 unsigned int number = 0;
109 2 unsigned int eighth_points = num_points / 8;
110
2/2 32768 for (; number < eighth_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
111 32766 __m256 x = _mm256_load_ps(in);
112 32766 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
113 65532 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
114 _mm256_blendv_ps(one, x, swap_mask));
115 32766 __m256 result = _m256_arctan_poly_avx(x_star);
116 32766 __m256 term = _mm256_and_ps(x_star, sign_mask);
117 32766 term = _mm256_or_ps(pi_over_2, term);
118 32766 term = _mm256_sub_ps(term, result);
119 32766 result = _mm256_blendv_ps(result, term, swap_mask);
120 _mm256_store_ps(out, result);
121 32766 in += 8;
122 32766 out += 8;
123 }
124
125 2 number = eighth_points * 8;
126
2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
127 14 *out++ = volk_arctan(*in++);
128 }
129 2 }
130 #endif /* LV_HAVE_AVX for aligned */
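The term = OR(pi/2, AND(x_star, sign_mask)) pair in each kernel is a branchless copysign: the AND isolates the sign bit of x_star, and the OR stamps it onto the magnitude bits of pi/2 (whose own sign bit is zero). A scalar equivalent in portable C99:

#include <stdint.h>
#include <string.h>

/* Branchless copysign(pi/2, x): the scalar analogue of the
 * sign_mask AND / OR sequence used by the kernels above. */
static float pi_over_2_with_sign_of(float x)
{
    const float pi_over_2 = 0x1.921fb6p0f;
    uint32_t xb, rb;
    memcpy(&xb, &x, sizeof xb);         /* bit pattern of x             */
    memcpy(&rb, &pi_over_2, sizeof rb); /* bit pattern of pi/2          */
    rb |= (xb & 0x80000000u);           /* stamp x's sign bit onto pi/2 */
    float r;
    memcpy(&r, &rb, sizeof r);
    return r;
}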
131
132 #ifdef LV_HAVE_SSE4_1
133 #include <smmintrin.h>
134 #include <volk/volk_sse_intrinsics.h>
135 static inline void
136 2 volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
137 {
138 2 const __m128 one = _mm_set1_ps(1.f);
139 2 const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
140 4 const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
141 2 const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
142
143 2 unsigned int number = 0;
144 2 unsigned int quarter_points = num_points / 4;
145
2/2 65536 for (; number < quarter_points; number++) {
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
146 65534 __m128 x = _mm_load_ps(in);
147 131068 __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
148 131068 __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
149 _mm_blendv_ps(one, x, swap_mask));
150 65534 __m128 result = _mm_arctan_poly_sse(x_star);
151 65534 __m128 term = _mm_and_ps(x_star, sign_mask);
152 65534 term = _mm_or_ps(pi_over_2, term);
153 65534 term = _mm_sub_ps(term, result);
154 65534 result = _mm_blendv_ps(result, term, swap_mask);
155 _mm_store_ps(out, result);
156 65534 in += 4;
157 65534 out += 4;
158 }
159
160 2 number = quarter_points * 4;
161
2/2 8 for (; number < num_points; number++) {
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
162 6 *out++ = volk_arctan(*in++);
163 }
164 2 }
165 #endif /* LV_HAVE_SSE4_1 for aligned */
166 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
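Everything above this point is the aligned (_a_) half of the header: those kernels use aligned loads and stores (_mm256_load_ps, _mm_store_ps, ...), which require 32- or 16-byte aligned pointers. The _u_ kernels below are identical except for the loadu/storeu forms, and the public dispatcher selects between the two halves at run time from the buffer addresses. A hypothetical caller (compute_angles is not part of VOLK) that guarantees the aligned path by allocating with volk_malloc(), as the Doxygen example above does:

#include <string.h>
#include <volk/volk.h>

/* Hypothetical helper: copy into VOLK-aligned buffers so the dispatcher
 * can pick an _a_ kernel; unaligned pointers would still be handled
 * correctly, just by the _u_ variants. */
static void
compute_angles(float* angles, const float* slopes, unsigned int num_points)
{
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * num_points, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
    memcpy(in, slopes, sizeof(float) * num_points);
    volk_32f_atan_32f(out, in, num_points);
    memcpy(angles, out, sizeof(float) * num_points);
    volk_free(in);
    volk_free(out);
}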
167
168 #ifndef INCLUDED_volk_32f_atan_32f_u_H
169 #define INCLUDED_volk_32f_atan_32f_u_H
170
171 #if LV_HAVE_AVX2 && LV_HAVE_FMA
172 #include <immintrin.h>
173 static inline void
174 2 volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
175 {
176 2 const __m256 one = _mm256_set1_ps(1.f);
177 2 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
178 4 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
179 2 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
180
181 2 unsigned int number = 0;
182 2 unsigned int eighth_points = num_points / 8;
183
2/2 32768 for (; number < eighth_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
184 32766 __m256 x = _mm256_loadu_ps(in);
185 32766 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
186 65532 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
187 _mm256_blendv_ps(one, x, swap_mask));
188 32766 __m256 result = _m256_arctan_poly_avx2_fma(x_star);
189 32766 __m256 term = _mm256_and_ps(x_star, sign_mask);
190 32766 term = _mm256_or_ps(pi_over_2, term);
191 32766 term = _mm256_sub_ps(term, result);
192 32766 result = _mm256_blendv_ps(result, term, swap_mask);
193 _mm256_storeu_ps(out, result);
194 32766 in += 8;
195 32766 out += 8;
196 }
197
198 2 number = eighth_points * 8;
199
2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
200 14 *out++ = volk_arctan(*in++);
201 }
202 2 }
203 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
204
205 #if LV_HAVE_AVX
206 #include <immintrin.h>
207 static inline void
208 2 volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
209 {
210 2 const __m256 one = _mm256_set1_ps(1.f);
211 2 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
212 4 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
213 2 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
214
215 2 unsigned int number = 0;
216 2 unsigned int eighth_points = num_points / 8;
217
2/2 32768 for (; number < eighth_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
218 32766 __m256 x = _mm256_loadu_ps(in);
219 32766 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
220 65532 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
221 _mm256_blendv_ps(one, x, swap_mask));
222 32766 __m256 result = _m256_arctan_poly_avx(x_star);
223 32766 __m256 term = _mm256_and_ps(x_star, sign_mask);
224 32766 term = _mm256_or_ps(pi_over_2, term);
225 32766 term = _mm256_sub_ps(term, result);
226 32766 result = _mm256_blendv_ps(result, term, swap_mask);
227 _mm256_storeu_ps(out, result);
228 32766 in += 8;
229 32766 out += 8;
230 }
231
232 2 number = eighth_points * 8;
233
2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
234 14 *out++ = volk_arctan(*in++);
235 }
236 2 }
237 #endif /* LV_HAVE_AVX for unaligned */
238
239 #ifdef LV_HAVE_SSE4_1
240 #include <smmintrin.h>
241 #include <volk/volk_sse_intrinsics.h>
242 static inline void
243 2 volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
244 {
245 2 const __m128 one = _mm_set1_ps(1.f);
246 2 const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
247 4 const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
248 2 const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
249
250 2 unsigned int number = 0;
251 2 unsigned int quarter_points = num_points / 4;
252
2/2 65536 for (; number < quarter_points; number++) {
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
253 65534 __m128 x = _mm_loadu_ps(in);
254 131068 __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
255 131068 __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
256 _mm_blendv_ps(one, x, swap_mask));
257 65534 __m128 result = _mm_arctan_poly_sse(x_star);
258 65534 __m128 term = _mm_and_ps(x_star, sign_mask);
259 65534 term = _mm_or_ps(pi_over_2, term);
260 65534 term = _mm_sub_ps(term, result);
261 65534 result = _mm_blendv_ps(result, term, swap_mask);
262 _mm_storeu_ps(out, result);
263 65534 in += 4;
264 65534 out += 4;
265 }
266
267 2 number = quarter_points * 4;
268
2/2 8 for (; number < num_points; number++) {
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
269 6 *out++ = volk_arctan(*in++);
270 }
271 2 }
272 #endif /* LV_HAVE_SSE4_1 for unaligned */
273
274 #ifdef LV_HAVE_GENERIC
275 static inline void
276 2 volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
277 {
278 2 unsigned int number = 0;
279
2/2 262144 for (; number < num_points; number++) {
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
280 262142 *out++ = volk_arctan(*in++);
281 }
282 2 }
283 #endif /* LV_HAVE_GENERIC */
284
285 #ifdef LV_HAVE_GENERIC
286 static inline void
287 2 volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
288 {
289 2 unsigned int number = 0;
290
2/2 262144 for (; number < num_points; number++) {
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
291 262142 *out++ = atanf(*in++);
292 }
293 2 }
294 #endif /* LV_HAVE_GENERIC */
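The two generic fallbacks differ only in the scalar routine they call: volk_32f_atan_32f_polynomial uses VOLK's volk_arctan() approximation (the same helper the SIMD kernels use for their tail loops), while volk_32f_atan_32f_generic defers to libm's atanf(), which typically serves as the accuracy reference. A hypothetical harness for eyeballing that trade-off, reusing atan_sketch() from the earlier scalar sketch as a stand-in for the approximation:

#include <math.h>
#include <stdio.h>

/* Sweep [-40, 40] and report the worst deviation of the approximation
 * (atan_sketch() from the earlier sketch) from libm's atanf(). */
int main(void)
{
    float max_err = 0.f;
    float worst_x = 0.f;
    for (int i = -400000; i <= 400000; i++) {
        float x = (float)i * 1e-4f;
        float err = fabsf(atan_sketch(x) - atanf(x));
        if (err > max_err) {
            max_err = err;
            worst_x = x;
        }
    }
    printf("max abs error %g at x = %g\n", max_err, worst_x);
    return 0;
}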
295
296 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
297