| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /* | ||
| 11 | * This file is intended to hold helper functions composed of AVX2 and FMA intrinsics. | ||
| 12 | * They should be used in VOLK kernels to avoid copy-paste. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ | ||
| 16 | #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ | ||
| 17 | #include <immintrin.h> | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Approximate arctan(x) for every packed float in x with the odd polynomial | ||
| 21 | * a1*x + a3*x^3 + ... + a13*x^13, intended for inputs on the interval [-1, 1]. | ||
| 22 | * The input is not range-checked or clamped here; the stated accuracy applies only on that interval. | ||
| 23 | * Maximum relative error ~6.5e-7 | ||
| 24 | * Polynomial evaluated via Horner's method in x*x using fused multiply-adds, then multiplied by x. | ||
| 25 | */ | ||
| 26 | 65532 | static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x) | |
| 27 | { | ||
| 28 | 65532 | const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); /* coefficient of x^1 (hex float literal), broadcast to all lanes */ | |
| 29 | 65532 | const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); /* coefficient of x^3 */ | |
| 30 | 65532 | const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f); /* coefficient of x^5 */ | |
| 31 | 65532 | const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f); /* coefficient of x^7 */ | |
| 32 | 65532 | const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f); /* coefficient of x^9 */ | |
| 33 | 65532 | const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f); /* coefficient of x^11 */ | |
| 34 | 65532 | const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f); /* coefficient of x^13 */ | |
| 35 | /* Horner evaluation in the variable x*x, starting from the highest-order coefficient. */ | ||
| 36 | 65532 | const __m256 x_times_x = _mm256_mul_ps(x, x); | |
| 37 | __m256 arctan; | ||
| 38 | 65532 | arctan = a13; /* seed the accumulator with the highest-order coefficient */ | |
| 39 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a11); /* arctan = x^2 * arctan + a11 */ | |
| 40 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a9); /* arctan = x^2 * arctan + a9 */ | |
| 41 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a7); /* arctan = x^2 * arctan + a7 */ | |
| 42 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a5); /* arctan = x^2 * arctan + a5 */ | |
| 43 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a3); /* arctan = x^2 * arctan + a3 */ | |
| 44 | 65532 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a1); /* arctan = x^2 * arctan + a1 */ | |
| 45 | 65532 | arctan = _mm256_mul_ps(x, arctan); /* final multiply by x yields the odd polynomial in x */ | |
| 46 | |||
| 47 | 65532 | return arctan; | |
| 48 | } | ||
| 49 | |||
| 50 | #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */ | ||
| 51 |