Line |
Branch |
Exec |
Source |
1 |
|
|
/* -*- c++ -*- */ |
2 |
|
|
/* |
3 |
|
|
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> |
4 |
|
|
* |
5 |
|
|
* This file is part of VOLK |
6 |
|
|
* |
7 |
|
|
* SPDX-License-Identifier: LGPL-3.0-or-later |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
/* |
11 |
|
|
* This file is intended to hold helper functions composed from AVX2 and FMA intrinsics. |
12 |
|
|
* They should be used in VOLK kernels to avoid copy-paste. |
13 |
|
|
*/ |
14 |
|
|
|
15 |
|
|
#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ |
16 |
|
|
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ |
17 |
|
|
#include <immintrin.h> |
18 |
|
|
|
19 |
|
|
/*
 * Polynomial approximation of arctan(x) for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *
 *     x * (c1 + c3*x^2 + c5*x^4 + ... + c13*x^12)
 *
 * using Horner's method in x^2, with one fused multiply-add per
 * coefficient. Maximum relative error ~6.5e-7 on that interval.
 *
 * The argument is neither range-checked nor range-reduced here.
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, replicated across the vector. */
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner recurrence, highest order first: acc = x^2 * acc + c_k. */
    __m256 acc = _mm256_fmadd_ps(x2, c13, c11);
    acc = _mm256_fmadd_ps(x2, acc, c9);
    acc = _mm256_fmadd_ps(x2, acc, c7);
    acc = _mm256_fmadd_ps(x2, acc, c5);
    acc = _mm256_fmadd_ps(x2, acc, c3);
    acc = _mm256_fmadd_ps(x2, acc, c1);

    /* The remaining factor of x makes the polynomial odd in x. */
    return _mm256_mul_ps(x, acc);
}
49 |
|
|
|
50 |
|
|
#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */ |
51 |
|
|
|