| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2015 Free Software Foundation, Inc. | ||
| 4 | * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> | ||
| 5 | * | ||
| 6 | * This file is part of VOLK | ||
| 7 | * | ||
| 8 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 9 | */ | ||
| 10 | |||
| 11 | /* | ||
| 12 | * This file holds reusable helper routines composed of SSE intrinsics. | ||
| 13 | * Use them in VOLK kernels instead of copy-pasting the same intrinsic sequences. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ | ||
| 17 | #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ | ||
| 18 | #include <xmmintrin.h> | ||
| 19 | |||
/*
 * Polynomial approximation of arctan(x) for four packed floats.
 *
 * Intended for inputs on the interval [-1, 1].
 * Maximum relative error ~6.5e-7.
 *
 * The approximation is the odd polynomial
 *   x * (a1 + a3*x^2 + a5*x^4 + ... + a13*x^12)
 * evaluated in x^2 with Horner's method, starting from the highest-order
 * coefficient (a13) and working down to a1.
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    const __m128 x2 = _mm_mul_ps(x, x);

    /* Each step computes poly = x2 * poly + a_k for the next lower odd k. */
    __m128 poly = _mm_set1_ps(+0x1.01a37cp-7f); /* a13 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(-0x1.2f3004p-5f)); /* a11 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(+0x1.5785aap-4f)); /* a9 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(-0x1.1436ap-3f));  /* a7 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(+0x1.972be6p-3f)); /* a5 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(-0x1.55437p-2f));  /* a3 */
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), _mm_set1_ps(+0x1.ffffeap-1f)); /* a1 */

    /* The final multiply by x restores the odd symmetry of arctan. */
    return _mm_mul_ps(x, poly);
}
| 56 | |||
/*
 * Squared magnitude of four complex values held as interleaved (I, Q)
 * float pairs: cplxValue1 carries the first two pairs, cplxValue2 the
 * last two. Returns I*I + Q*Q for each of the four pairs, in order.
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    /* Even lanes of both inputs -> i1 i2 i3 i4 */
    const __m128 in_phase =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    /* Odd lanes of both inputs -> q1 q2 q3 q4 */
    const __m128 quadrature =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    /* I^2 + Q^2 per lane */
    return _mm_add_ps(_mm_mul_ps(in_phase, in_phase),
                      _mm_mul_ps(quadrature, quadrature));
}
| 68 | |||
| 69 | 131068 | static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) | |
| 70 | { | ||
| 71 | 262136 | return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); | |
| 72 | } | ||
| 73 | |||
| 74 | 131068 | static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, | |
| 75 | const __m128 symbols1, | ||
| 76 | const __m128 points0, | ||
| 77 | const __m128 points1, | ||
| 78 | const __m128 scalar) | ||
| 79 | { | ||
| 80 | // calculate scalar * |x - y|^2 | ||
| 81 | 131068 | const __m128 diff0 = _mm_sub_ps(symbols0, points0); | |
| 82 | 131068 | const __m128 diff1 = _mm_sub_ps(symbols1, points1); | |
| 83 | 131068 | const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); | |
| 84 | 131068 | return _mm_mul_ps(norms, scalar); | |
| 85 | } | ||
| 86 | |||
/*
 * Lane-wise update of a running sum:
 *
 *   result = sq_acc + rec * (aux * val - acc)^2
 *
 * All five arguments are packed-float vectors; the function only returns
 * the new value and does not modify any caller state.
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    /* aux * val - acc */
    const __m128 delta = _mm_sub_ps(_mm_mul_ps(aux, val), acc);
    /* (delta * delta) * rec, multiplied in that order */
    const __m128 weighted_sq = _mm_mul_ps(_mm_mul_ps(delta, delta), rec);
    return _mm_add_ps(sq_acc, weighted_sq);
}
| 96 | |||
| 97 | #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */ | ||
| 98 |