/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX intrinsics of intrinsics.
 * They should be used in VOLK kernels to avoid copy-pasta.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Polynomial evaluated via Horner's method
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x_times_x = _mm256_mul_ps(x, x);
    __m256 arctan;
    arctan = a13;
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a11);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a9);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a7);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a5);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a3);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a1);
    arctan = _mm256_mul_ps(x, arctan);

    return arctan;
}
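
/*
 * Usage sketch (illustrative; buffer names are hypothetical). The polynomial
 * is only valid on [-1, 1], so callers are expected to range-reduce first:
 *
 *   float in[8];  // arguments already reduced to [-1, 1]
 *   float out[8];
 *   __m256 x = _mm256_loadu_ps(in);
 *   __m256 y = _m256_arctan_poly_avx(x); // ~atan(x) per lane
 *   _mm256_storeu_ps(out, y);
 */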

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);        // Load yh with ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di

    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(tmp1, tmp2);
}
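
/*
 * Worked expansion (for reference): with each lane pair of x holding a + bi
 * and each lane pair of y holding c + di, the return value is the complex
 * product
 *   (a + bi)(c + di) = (a*c - b*d) + i(a*d + b*c)
 * computed for four complex values at once.
 */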

static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // flip the sign of each imaginary part
}
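
/*
 * Note: XOR-ing a float with -0.f flips only its IEEE-754 sign bit, so the
 * setr pattern above negates every odd (imaginary) lane while leaving the
 * even (real) lanes untouched, e.g. (3, 4) -> (3, -4).
 */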

static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1);
    const __m256 dreal = _mm256_moveldup_ps(y);
    const __m256 dimag = _mm256_movehdup_ps(y);

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
    const __m256 multreal = _mm256_mul_ps(x, dreal);
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
    return _mm256_add_ps(multreal, multimag);
}
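
/*
 * Worked expansion (for reference): with x = a + bi and y = c + di per lane
 * pair, the return value is x * conj(y):
 *   (a + bi)(c - di) = (a*c + b*d) + i(b*c - a*d)
 */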

static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val);
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1);
}
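
/*
 * Usage sketch (buffer name hypothetical): scales four interleaved complex
 * floats to unit magnitude, i.e. each (re, im) pair is divided by its own
 * sqrt(re^2 + im^2):
 *
 *   __m256 v = _mm256_loadu_ps(complex_in); // (ar, ai, br, bi, cr, ci, dr, di)
 *   __m256 u = _mm256_normalize_ps(v);      // each (re, im) pair now has |u| == 1
 */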

static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2); // Add the I^2 and Q^2 values
}
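
/*
 * Note on ordering (derived from the permutes above): the two inputs hold
 * eight interleaved complex floats, and the result packs their squared
 * magnitudes in original sample order. A hypothetical caller:
 *
 *   __m256 a = _mm256_loadu_ps(in);     // samples 0..3 as (re, im) pairs
 *   __m256 b = _mm256_loadu_ps(in + 8); // samples 4..7
 *   __m256 mag2 = _mm256_magnitudesquared_ps(a, b); // re^2+im^2, samples 0..7
 */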

static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}

static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
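
/*
 * Usage sketch (names hypothetical): a typical soft-decision use is scaling
 * the squared Euclidean distance between received symbols and constellation
 * points by the linear SNR:
 *
 *   __m256 snr = _mm256_set1_ps(snr_lin);
 *   __m256 d = _mm256_scaled_norm_dist_ps(sym0, sym1, pts0, pts1, snr);
 */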

static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    __m256 sign_mask_dummy = _mm256_setzero_ps();
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m128i shuffle_mask0 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
    const __m128i shuffle_mask1 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);

    __m256 sign_mask =
        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
    // This is the desired function call, though it seems to be missing in GCC.
    // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
    // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
    //                        _mm_castsi128_ps(sign_bits0));
}
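
/*
 * What this produces (derived from the shuffles above): each of the first
 * eight input bytes that is positive contributes 0x80000000 in the
 * corresponding 32-bit lane, i.e. a mask that flips exactly the float sign
 * bits selected by 'fbits' when XOR-ed onto a __m256. (The polar kernels
 * appear to pass 0/1 flag bytes here.)
 */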

static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // deinterleave values
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}
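
/*
 * Shuffle effect (for reference): with src0 = (s0 ... s7) and
 * src1 = (s8 ... s15), the outputs are the even and odd elements in order:
 *   *llr0 = (s0, s2, s4, s6, s8, s10, s12, s14)
 *   *llr1 = (s1, s3, s5, s7, s9, s11, s13, s15)
 */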

static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
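
/*
 * Per lane this computes the min-sum approximation of the LLR box-plus
 * operation: sign(llr0) * sign(llr1) * min(|llr0|, |llr1|), with the
 * deinterleaved even/odd elements as the two operands.
 */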

static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
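
/*
 * Per lane this computes (-1)^f * llr0 + llr1, with the f bits expanded from
 * 'fbits' by _mm256_polar_sign_mask(); this is the usual g-function step of
 * successive-cancellation polar decoding.
 */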

static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}
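
/*
 * Per lane this returns sq_acc + rec * (aux * val - acc)^2, i.e. it folds one
 * scaled squared deviation into a running accumulator. (In VOLK this pattern
 * serves streaming mean/variance style kernels; the caller maintains the
 * running sum 'acc' and the reciprocal weights 'rec'.)
 */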

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */