| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | | | /* -*- c++ -*- */ |
| 2 | | | /* |
| 3 | | | * Copyright 2015 Free Software Foundation, Inc. |
| 4 | | | * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> |
| 5 | | | * |
| 6 | | | * This file is part of VOLK |
| 7 | | | * |
| 8 | | | * SPDX-License-Identifier: LGPL-3.0-or-later |
| 9 | | | */ |
| 10 | | | |
| 11 | | | /* |
| 12 | | | * This file is intended to hold AVX intrinsics of intrinsics. |
| 13 | | | * They should be used in VOLK kernels to avoid copy-pasta. |
| 14 | | | */ |
| 15 | | | |
| 16 | | | #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ |
| 17 | | | #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ |
| 18 | | | #include <immintrin.h> |
| 19 | | | |
| 20 | | | /* |
| 21 | | | * Approximate arctan(x) via polynomial expansion |
| 22 | | | * on the interval [-1, 1] |
| 23 | | | * |
| 24 | | | * Maximum relative error ~6.5e-7 |
| 25 | | | * Polynomial evaluated via Horner's method |
| 26 | | | */ |
| 27 | | 65532 | static inline __m256 _m256_arctan_poly_avx(const __m256 x) |
| 28 | | | { |
| 29 | | 65532 | const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); |
| 30 | | 65532 | const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); |
| 31 | | 65532 | const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f); |
| 32 | | 65532 | const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f); |
| 33 | | 65532 | const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f); |
| 34 | | 65532 | const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f); |
| 35 | | 65532 | const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f); |
| 36 | | | |
| 37 | | 65532 | const __m256 x_times_x = _mm256_mul_ps(x, x); |
| 38 | | | __m256 arctan; |
| 39 | | 65532 | arctan = a13; |
| 40 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 41 | | 65532 | arctan = _mm256_add_ps(arctan, a11); |
| 42 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 43 | | 65532 | arctan = _mm256_add_ps(arctan, a9); |
| 44 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 45 | | 65532 | arctan = _mm256_add_ps(arctan, a7); |
| 46 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 47 | | 65532 | arctan = _mm256_add_ps(arctan, a5); |
| 48 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 49 | | 65532 | arctan = _mm256_add_ps(arctan, a3); |
| 50 | | 65532 | arctan = _mm256_mul_ps(x_times_x, arctan); |
| 51 | | 65532 | arctan = _mm256_add_ps(arctan, a1); |
| 52 | | 65532 | arctan = _mm256_mul_ps(x, arctan); |
| 53 | | | |
| 54 | | 65532 | return arctan; |
| 55 | | | } |
| 56 | | | |
| 57 | | 393204 | static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) |
| 58 | | | { |
| 59 | | | __m256 yl, yh, tmp1, tmp2; |
| 60 | | 393204 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... |
| 61 | | 393204 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... |
| 62 | | 393204 | tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... |
| 63 | | 393204 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... |
| 64 | | 393204 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di |
| 65 | | | |
| 66 | | | // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |
| 67 | | 393204 | return _mm256_addsub_ps(tmp1, tmp2); |
| 68 | | | } |
| 69 | | | |
| 70 | | | static inline __m256 _mm256_conjugate_ps(__m256 x) |
| 71 | | | { |
| 72 | | | const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); |
| 73 | | | return _mm256_xor_ps(x, conjugator); // conjugate y |
| 74 | | | } |
| 75 | | | |
| 76 | | 393202 | static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y) |
| 77 | | | { |
| 78 | | 393202 | const __m256 nswap = _mm256_permute_ps(x, 0xb1); |
| 79 | | 393202 | const __m256 dreal = _mm256_moveldup_ps(y); |
| 80 | | 393202 | const __m256 dimag = _mm256_movehdup_ps(y); |
| 81 | | | |
| 82 | | 393202 | const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); |
| 83 | | 393202 | const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator); |
| 84 | | 393202 | const __m256 multreal = _mm256_mul_ps(x, dreal); |
| 85 | | 393202 | const __m256 multimag = _mm256_mul_ps(nswap, dimagconj); |
| 86 | | 393202 | return _mm256_add_ps(multreal, multimag); |
| 87 | | | } |
| 88 | | | |
| 89 | | 1024 | static inline __m256 _mm256_normalize_ps(__m256 val) |
| 90 | | | { |
| 91 | | 1024 | __m256 tmp1 = _mm256_mul_ps(val, val); |
| 92 | | 1024 | tmp1 = _mm256_hadd_ps(tmp1, tmp1); |
| 93 | | 1024 | tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 |
| 94 | | 1024 | tmp1 = _mm256_sqrt_ps(tmp1); |
| 95 | | 1024 | return _mm256_div_ps(val, tmp1); |
| 96 | | | } |
| 97 | | | |
| 98 | | 229362 | static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) |
| 99 | | | { |
| 100 | | | __m256 complex1, complex2; |
| 101 | | 229362 | cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values |
| 102 | | 229362 | cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values |
| 103 | | 229362 | complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); |
| 104 | | 229362 | complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); |
| 105 | | 229362 | return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values |
| 106 | | | } |
| 107 | | | |
| 108 | | 65532 | static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) |
| 109 | | | { |
| 110 | | 131064 | return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); |
| 111 | | | } |
| 112 | | | |
| 113 | | 65532 | static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, |
| 114 | | | const __m256 symbols1, |
| 115 | | | const __m256 points0, |
| 116 | | | const __m256 points1, |
| 117 | | | const __m256 scalar) |
| 118 | | | { |
| 119 | | | /* |
| 120 | | | * Calculate: \|y - x\|^2 * SNR_lin |
| 121 | | | * Consider 'symbolsX' and 'pointsX' to be complex float |
| 122 | | | * 'symbolsX' are 'y' and 'pointsX' are 'x' |
| 123 | | | */ |
| 124 | | 65532 | const __m256 diff0 = _mm256_sub_ps(symbols0, points0); |
| 125 | | 65532 | const __m256 diff1 = _mm256_sub_ps(symbols1, points1); |
| 126 | | 65532 | const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); |
| 127 | | 65532 | return _mm256_mul_ps(norms, scalar); |
| 128 | | | } |
| 129 | | | |
| 130 | | 4608 | static inline __m256 _mm256_polar_sign_mask(__m128i fbits) |
| 131 | | | { |
| 132 | | 4608 | __m256 sign_mask_dummy = _mm256_setzero_ps(); |
| 133 | | 4608 | const __m128i zeros = _mm_set1_epi8(0x00); |
| 134 | | 4608 | const __m128i sign_extract = _mm_set1_epi8(0x80); |
| 135 | | 4608 | const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, |
| 136 | | | 0xff, |
| 137 | | | 0xff, |
| 138 | | | 0x00, |
| 139 | | | 0xff, |
| 140 | | | 0xff, |
| 141 | | | 0xff, |
| 142 | | | 0x01, |
| 143 | | | 0xff, |
| 144 | | | 0xff, |
| 145 | | | 0xff, |
| 146 | | | 0x02, |
| 147 | | | 0xff, |
| 148 | | | 0xff, |
| 149 | | | 0xff, |
| 150 | | | 0x03); |
| 151 | | 4608 | const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, |
| 152 | | | 0xff, |
| 153 | | | 0xff, |
| 154 | | | 0x04, |
| 155 | | | 0xff, |
| 156 | | | 0xff, |
| 157 | | | 0xff, |
| 158 | | | 0x05, |
| 159 | | | 0xff, |
| 160 | | | 0xff, |
| 161 | | | 0xff, |
| 162 | | | 0x06, |
| 163 | | | 0xff, |
| 164 | | | 0xff, |
| 165 | | | 0xff, |
| 166 | | | 0x07); |
| 167 | | | |
| 168 | | 4608 | fbits = _mm_cmpgt_epi8(fbits, zeros); |
| 169 | | 4608 | fbits = _mm_and_si128(fbits, sign_extract); |
| 170 | | 4608 | __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); |
| 171 | | 4608 | __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); |
| 172 | | | |
| 173 | | | __m256 sign_mask = |
| 174 | | 4608 | _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); |
| 175 | | 4608 | return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); |
| 176 | | | // // This is the desired function call. Though it seems to be missing in GCC. |
| 177 | | | // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# |
| 178 | | | // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), |
| 179 | | | // _mm_castsi128_ps(sign_bits0)); |
| 180 | | | } |
| 181 | | | |
| 182 | | | static inline void |
| 183 | | 18432 | _mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1) |
| 184 | | | { |
| 185 | | | // deinterleave values |
| 186 | | 18432 | __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); |
| 187 | | 18432 | __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); |
| 188 | | 18432 | *llr0 = _mm256_shuffle_ps(part0, part1, 0x88); |
| 189 | | 18432 | *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); |
| 190 | | 18432 | } |
| 191 | | | |
| 192 | | 9216 | static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1) |
| 193 | | | { |
| 194 | | 9216 | const __m256 sign_mask = _mm256_set1_ps(-0.0f); |
| 195 | | | const __m256 abs_mask = |
| 196 | | 18432 | _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); |
| 197 | | | |
| 198 | | | __m256 llr0, llr1; |
| 199 | | 9216 | _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); |
| 200 | | | |
| 201 | | | // calculate result |
| 202 | | | __m256 sign = |
| 203 | | 27648 | _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); |
| 204 | | | __m256 dst = |
| 205 | | 36864 | _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); |
| 206 | | 9216 | return _mm256_or_ps(dst, sign); |
| 207 | | | } |
| 208 | | | |
| 209 | | 4608 | static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits) |
| 210 | | | { |
| 211 | | | // prepare sign mask for correct +- |
| 212 | | 4608 | __m256 sign_mask = _mm256_polar_sign_mask(fbits); |
| 213 | | | |
| 214 | | | __m256 llr0, llr1; |
| 215 | | 4608 | _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); |
| 216 | | | |
| 217 | | | // calculate result |
| 218 | | 4608 | llr0 = _mm256_xor_ps(llr0, sign_mask); |
| 219 | | 4608 | __m256 dst = _mm256_add_ps(llr0, llr1); |
| 220 | | 4608 | return dst; |
| 221 | | | } |
| 222 | | | |
| 223 | | 65520 | static inline __m256 _mm256_accumulate_square_sum_ps( |
| 224 | | | __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux) |
| 225 | | | { |
| 226 | | 65520 | aux = _mm256_mul_ps(aux, val); |
| 227 | | 65520 | aux = _mm256_sub_ps(aux, acc); |
| 228 | | 65520 | aux = _mm256_mul_ps(aux, aux); |
| 229 | | 65520 | aux = _mm256_mul_ps(aux, rec); |
| 230 | | 65520 | return _mm256_add_ps(sq_acc, aux); |
| 231 | | | } |
| 232 | | | |
| 233 | | | #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */ |
| 234 | | | |
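
For reference, the notes below sketch the arithmetic behind the helpers in the listing above. They are illustrative scalar models, not part of VOLK; any function or buffer name introduced here is hypothetical.

`_m256_arctan_poly_avx` evaluates an odd, degree-13 polynomial with Horner's method, using the coefficients shown in the listing. A minimal scalar sketch, valid only on [-1, 1] as the in-file comment states:

```c
/* Scalar model of _m256_arctan_poly_avx (the name arctan_poly_ref is
 * hypothetical). Approximates arctan(x) on [-1, 1];
 * maximum relative error ~6.5e-7. */
static inline float arctan_poly_ref(float x)
{
    const float a1 = +0x1.ffffeap-1f;
    const float a3 = -0x1.55437p-2f;
    const float a5 = +0x1.972be6p-3f;
    const float a7 = -0x1.1436ap-3f;
    const float a9 = +0x1.5785aap-4f;
    const float a11 = -0x1.2f3004p-5f;
    const float a13 = +0x1.01a37cp-7f;

    const float x2 = x * x;
    /* Horner's method: x * (a1 + x^2*(a3 + x^2*(a5 + ...))) */
    float p = a13;
    p = x2 * p + a11;
    p = x2 * p + a9;
    p = x2 * p + a7;
    p = x2 * p + a5;
    p = x2 * p + a3;
    p = x2 * p + a1;
    return x * p;
}
```

Inputs outside [-1, 1] need range reduction before the polynomial, e.g. via the identity arctan(x) = sign(x)·π/2 − arctan(1/x) for \|x\| ≥ 1.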
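
`_mm256_complexmul_ps` and `_mm256_complexconjugatemul_ps` treat each register as four interleaved complex floats (re, im, re, im, ...). Per lane they compute an ordinary complex product and a product with the conjugated second operand, exactly as the comments in the listing spell out; the AVX code reaches the same results with moveldup/movehdup, a shuffle, and addsub/xor. Scalar models of one lane (names hypothetical):

```c
/* One complex lane of _mm256_complexmul_ps:
 * (ar + i*ai) * (cr + i*ci) = (ar*cr - ai*ci) + i*(ai*cr + ar*ci) */
static inline void complexmul_ref(
    float ar, float ai, float cr, float ci, float* re, float* im)
{
    *re = ar * cr - ai * ci;
    *im = ai * cr + ar * ci;
}

/* One complex lane of _mm256_complexconjugatemul_ps: x * conj(y) */
static inline void complexconjugatemul_ref(
    float xr, float xi, float yr, float yi, float* re, float* im)
{
    *re = xr * yr + xi * yi;
    *im = xi * yr - xr * yi;
}
```

`_mm256_conjugate_ps` is the same sign trick in isolation: XOR-ing -0.0f into every imaginary slot flips only the sign bit of the imaginary parts.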
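
`_mm256_magnitudesquared_ps` and `_mm256_magnitude_ps` take two registers holding eight interleaved complex floats and return the eight results in order (the permute2f128/hadd pair restores the natural ordering). A hedged usage sketch; the driver function, buffer names, and tail handling are illustrative, and it assumes the header is reachable as `<volk/volk_avx_intrinsics.h>`:

```c
#include <math.h>
#include <volk/volk_avx_intrinsics.h>

/* Compute |z| for num_points complex floats stored as interleaved (re, im)
 * pairs. Illustrative driver, not a VOLK kernel; in and out must not alias. */
static void magnitude_ref(float* out, const float* in, unsigned int num_points)
{
    unsigned int i = 0;
    for (; i + 8 <= num_points; i += 8) {
        const __m256 c0 = _mm256_loadu_ps(in + 2 * i);     /* z[i]   .. z[i+3] */
        const __m256 c1 = _mm256_loadu_ps(in + 2 * i + 8); /* z[i+4] .. z[i+7] */
        _mm256_storeu_ps(out + i, _mm256_magnitude_ps(c0, c1));
    }
    for (; i < num_points; i++) { /* scalar tail */
        out[i] = hypotf(in[2 * i], in[2 * i + 1]);
    }
}
```

`_mm256_normalize_ps` applies the same squared-magnitude/hadd pattern to divide four interleaved complex values by their own magnitudes.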
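
`_mm256_scaled_norm_dist_ps` computes \|y − x\|² · scalar per complex sample, with `symbolsX` as y and `pointsX` as x, as its in-code comment states. One-sample scalar model (name hypothetical):

```c
/* One sample of _mm256_scaled_norm_dist_ps: squared Euclidean distance
 * between symbol y and point x, scaled (e.g. by a linear SNR factor). */
static inline float scaled_norm_dist_ref(
    float yr, float yi, float xr, float xi, float scalar)
{
    const float dr = yr - xr;
    const float di = yi - xi;
    return (dr * dr + di * di) * scalar;
}
```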
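
The `_mm256_polar_*` helpers work on LLR pairs: `_mm256_polar_deinterleave` splits the even- and odd-indexed floats of the two source registers into llr0 and llr1, `_mm256_polar_minsum_llrs` combines the sign bits and keeps the smaller magnitude, and `_mm256_polar_fsign_add_llrs` negates llr0 wherever the corresponding byte of `fbits` is set before adding llr1 (in effect, the f and g update rules used in min-sum polar decoding). Scalar models of the per-pair arithmetic (names hypothetical):

```c
#include <math.h>

/* One pair of _mm256_polar_minsum_llrs: XOR the sign bits, keep the
 * smaller magnitude. */
static inline float polar_minsum_ref(float llr0, float llr1)
{
    const int neg = (signbit(llr0) != 0) != (signbit(llr1) != 0);
    const float mag = fminf(fabsf(llr0), fabsf(llr1));
    return neg ? -mag : mag;
}

/* One pair of _mm256_polar_fsign_add_llrs: negate llr0 where the
 * partial-sum bit is set (fbit in {0, 1}), then add llr1. */
static inline float polar_fsign_add_ref(float llr0, float llr1, unsigned char fbit)
{
    return (fbit ? -llr0 : llr0) + llr1;
}
```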
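
`_mm256_accumulate_square_sum_ps` adds rec · (aux · val − acc)² to the running accumulator sq_acc, eight floats at a time; as the name suggests, it is a building block for accumulating sums of squares. Per-element scalar rendering (name hypothetical):

```c
/* One element of _mm256_accumulate_square_sum_ps:
 * sq_acc += rec * (aux * val - acc)^2 */
static inline float accumulate_square_sum_ref(
    float sq_acc, float acc, float val, float rec, float aux)
{
    const float d = aux * val - acc;
    return sq_acc + rec * d * d;
}
```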