GCC Code Coverage Report


Directory: ./
File: include/volk/volk_avx_intrinsics.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 96 96 100.0%
Functions: 12 12 100.0%
Branches: 0 0 -%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11 /*
12 * This file is intended to hold AVX intrinsics of intrinsics.
13 * They should be used in VOLK kernels to avoid copy-pasta.
14 */
15
16 #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
18 #include <immintrin.h>
19
20 /*
21 * Approximate arctan(x) via polynomial expansion
22 * on the interval [-1, 1]
23 *
24 * Maximum relative error ~6.5e-7
25 * Polynomial evaluated via Horner's method
26 */
27 65532 static inline __m256 _m256_arctan_poly_avx(const __m256 x)
28 {
29 65532 const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
30 65532 const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
31 65532 const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
32 65532 const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
33 65532 const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
34 65532 const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
35 65532 const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
36
37 65532 const __m256 x_times_x = _mm256_mul_ps(x, x);
38 __m256 arctan;
39 65532 arctan = a13;
40 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
41 65532 arctan = _mm256_add_ps(arctan, a11);
42 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
43 65532 arctan = _mm256_add_ps(arctan, a9);
44 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
45 65532 arctan = _mm256_add_ps(arctan, a7);
46 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
47 65532 arctan = _mm256_add_ps(arctan, a5);
48 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
49 65532 arctan = _mm256_add_ps(arctan, a3);
50 65532 arctan = _mm256_mul_ps(x_times_x, arctan);
51 65532 arctan = _mm256_add_ps(arctan, a1);
52 65532 arctan = _mm256_mul_ps(x, arctan);
53
54 65532 return arctan;
55 }
56
57 393204 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
58 {
59 __m256 yl, yh, tmp1, tmp2;
60 393204 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
61 393204 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
62 393204 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
63 393204 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
64 393204 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
65
66 // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
67 393204 return _mm256_addsub_ps(tmp1, tmp2);
68 }
69
70 static inline __m256 _mm256_conjugate_ps(__m256 x)
71 {
72 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
73 return _mm256_xor_ps(x, conjugator); // conjugate y
74 }
75
76 393202 static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
77 {
78 393202 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
79 393202 const __m256 dreal = _mm256_moveldup_ps(y);
80 393202 const __m256 dimag = _mm256_movehdup_ps(y);
81
82 393202 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
83 393202 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
84 393202 const __m256 multreal = _mm256_mul_ps(x, dreal);
85 393202 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
86 393202 return _mm256_add_ps(multreal, multimag);
87 }
88
89 1024 static inline __m256 _mm256_normalize_ps(__m256 val)
90 {
91 1024 __m256 tmp1 = _mm256_mul_ps(val, val);
92 1024 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
93 1024 tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
94 1024 tmp1 = _mm256_sqrt_ps(tmp1);
95 1024 return _mm256_div_ps(val, tmp1);
96 }
97
98 229362 static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
99 {
100 __m256 complex1, complex2;
101 229362 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
102 229362 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
103 229362 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
104 229362 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
105 229362 return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
106 }
107
108 65532 static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
109 {
110 131064 return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
111 }
112
113 65532 static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
114 const __m256 symbols1,
115 const __m256 points0,
116 const __m256 points1,
117 const __m256 scalar)
118 {
119 /*
120 * Calculate: |y - x|^2 * SNR_lin
121 * Consider 'symbolsX' and 'pointsX' to be complex float
122 * 'symbolsX' are 'y' and 'pointsX' are 'x'
123 */
124 65532 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
125 65532 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
126 65532 const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
127 65532 return _mm256_mul_ps(norms, scalar);
128 }
129
130 4608 static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
131 {
132 4608 __m256 sign_mask_dummy = _mm256_setzero_ps();
133 4608 const __m128i zeros = _mm_set1_epi8(0x00);
134 4608 const __m128i sign_extract = _mm_set1_epi8(0x80);
135 4608 const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
136 0xff,
137 0xff,
138 0x00,
139 0xff,
140 0xff,
141 0xff,
142 0x01,
143 0xff,
144 0xff,
145 0xff,
146 0x02,
147 0xff,
148 0xff,
149 0xff,
150 0x03);
151 4608 const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
152 0xff,
153 0xff,
154 0x04,
155 0xff,
156 0xff,
157 0xff,
158 0x05,
159 0xff,
160 0xff,
161 0xff,
162 0x06,
163 0xff,
164 0xff,
165 0xff,
166 0x07);
167
168 4608 fbits = _mm_cmpgt_epi8(fbits, zeros);
169 4608 fbits = _mm_and_si128(fbits, sign_extract);
170 4608 __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
171 4608 __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
172
173 __m256 sign_mask =
174 4608 _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
175 4608 return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
176 // // This is the desired function call. Though it seems to be missing in GCC.
177 // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
178 // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
179 // _mm_castsi128_ps(sign_bits0));
180 }
181
182 static inline void
183 18432 _mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
184 {
185 // deinterleave values
186 18432 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
187 18432 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
188 18432 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
189 18432 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
190 18432 }
191
192 9216 static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
193 {
194 9216 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
195 const __m256 abs_mask =
196 18432 _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
197
198 __m256 llr0, llr1;
199 9216 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
200
201 // calculate result
202 __m256 sign =
203 27648 _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
204 __m256 dst =
205 36864 _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
206 9216 return _mm256_or_ps(dst, sign);
207 }
208
209 4608 static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
210 {
211 // prepare sign mask for correct +-
212 4608 __m256 sign_mask = _mm256_polar_sign_mask(fbits);
213
214 __m256 llr0, llr1;
215 4608 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
216
217 // calculate result
218 4608 llr0 = _mm256_xor_ps(llr0, sign_mask);
219 4608 __m256 dst = _mm256_add_ps(llr0, llr1);
220 4608 return dst;
221 }
222
223 65520 static inline __m256 _mm256_accumulate_square_sum_ps(
224 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
225 {
226 65520 aux = _mm256_mul_ps(aux, val);
227 65520 aux = _mm256_sub_ps(aux, acc);
228 65520 aux = _mm256_mul_ps(aux, aux);
229 65520 aux = _mm256_mul_ps(aux, rec);
230 65520 return _mm256_add_ps(sq_acc, aux);
231 }
232
233 #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
234