GCC Code Coverage Report


Directory: ./
File: include/volk/volk_avx2_intrinsics.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 68 68 100.0%
Functions: 8 8 100.0%
Branches: 0 0 -%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*
11 * This file is intended to hold AVX2 intrinsics that are built from other intrinsics.
12 * They should be used in VOLK kernels to avoid copy-paste.
13 */
14
15 #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
17 #include "volk/volk_avx_intrinsics.h"
18 #include <immintrin.h>
19
20 4608 static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
21 {
22 4608 const __m128i zeros = _mm_set1_epi8(0x00);
23 4608 const __m128i sign_extract = _mm_set1_epi8(0x80);
24 4608 const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
25 0xff,
26 0xff,
27 0x00,
28 0xff,
29 0xff,
30 0xff,
31 0x01,
32 0xff,
33 0xff,
34 0xff,
35 0x02,
36 0xff,
37 0xff,
38 0xff,
39 0x03,
40 0xff,
41 0xff,
42 0xff,
43 0x04,
44 0xff,
45 0xff,
46 0xff,
47 0x05,
48 0xff,
49 0xff,
50 0xff,
51 0x06,
52 0xff,
53 0xff,
54 0xff,
55 0x07);
56 4608 __m256i sign_bits = _mm256_setzero_si256();
57
58 4608 fbits = _mm_cmpgt_epi8(fbits, zeros);
59 4608 fbits = _mm_and_si128(fbits, sign_extract);
60 4608 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
61 4608 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
62 4608 sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
63
64 4608 return _mm256_castsi256_ps(sign_bits);
65 }
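
In scalar terms, the mask built above carries only the IEEE 754 sign bit of each of the eight output floats, set whenever the corresponding byte of fbits is positive (the two insertf128 calls replicate fbits so the per-128-bit-lane byte shuffle can reach bytes 4-7 in the upper half). A minimal scalar sketch of the same mask, with a hypothetical helper name and byte-array layout:

#include <stdint.h>

/* Sketch only, not part of the header: scalar equivalent of the mask
 * produced by _mm256_polar_sign_mask_avx2 for its 8 output lanes. */
static inline void polar_sign_mask_scalar(const int8_t fbits[8], uint32_t mask[8])
{
    for (int i = 0; i < 8; ++i) {
        /* only the float sign bit is set when the bit byte is positive */
        mask[i] = (fbits[i] > 0) ? 0x80000000u : 0x00000000u;
    }
}
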
66
67 static inline __m256
68 4608 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
69 {
70 // prepare sign mask for correct +-
71 4608 __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
72
73 __m256 llr0, llr1;
74 4608 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
75
76 // calculate result
77 4608 llr0 = _mm256_xor_ps(llr0, sign_mask);
78 4608 __m256 dst = _mm256_add_ps(llr0, llr1);
79 4608 return dst;
80 }
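
Read scalar-wise, this resembles the polar decoder g-function: the sign of the first LLR is flipped according to the partial-sum bit before the second LLR is added. A rough sketch, assuming the deinterleave step has already split the pairwise-interleaved inputs into separate arrays (the helper name and array layout are illustrative):

/* Sketch only: out[i] = (-1)^{bit_i} * llr_a[i] + llr_b[i] for 8 elements. */
static inline void polar_fsign_add_llrs_scalar(const float llr_a[8],
                                               const float llr_b[8],
                                               const int8_t fbits[8],
                                               float out[8])
{
    for (int i = 0; i < 8; ++i) {
        out[i] = (fbits[i] > 0 ? -llr_a[i] : llr_a[i]) + llr_b[i];
    }
}
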
81
82 65532 static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
83 const __m256 cplxValue1)
84 {
85 65532 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
86 65532 const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
87 65532 const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
88 65532 const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
89 65532 return _mm256_permutevar8x32_ps(complex_result, idx);
90 }
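
The final permute with idx undoes the lane interleaving introduced by hadd_ps, so the result holds |z_0|^2 ... |z_7|^2 in natural order for the eight complex floats packed (re, im, re, im, ...) across the two inputs. A scalar sketch of the same computation (helper name and flat-array layout are illustrative):

/* Sketch only: squared magnitude of 8 interleaved complex floats. */
static inline void magnitudesquared_scalar(const float interleaved[16], float out[8])
{
    for (int i = 0; i < 8; ++i) {
        const float re = interleaved[2 * i];
        const float im = interleaved[2 * i + 1];
        out[i] = re * re + im * im;
    }
}
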
91
92 65532 static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
93 const __m256 symbols1,
94 const __m256 points0,
95 const __m256 points1,
96 const __m256 scalar)
97 {
98 /*
99 * Calculate: |y - x|^2 * SNR_lin
100 * Consider 'symbolsX' and 'pointsX' to be complex float
101 * 'symbolsX' are 'y' and 'pointsX' are 'x'
102 */
103 65532 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
104 65532 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
105 65532 const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
106 65532 return _mm256_mul_ps(norms, scalar);
107 }
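
Per complex point this is just the squared Euclidean distance between a received symbol y and a constellation point x, scaled by the given factor. A minimal scalar sketch (names are illustrative):

/* Sketch only: |y - x|^2 * scalar for one complex point. */
static inline float scaled_norm_dist_scalar(
    float y_re, float y_im, float x_re, float x_im, float scalar)
{
    const float d_re = y_re - x_re;
    const float d_im = y_im - x_im;
    return (d_re * d_re + d_im * d_im) * scalar;
}
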
108
109 /*
110 * The function below vectorizes the inner loop of the following code:
111 *
112 * float max_values[8] = {0.f};
113 * unsigned max_indices[8] = {0};
114 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
115 * for (unsigned i = 0; i < num_points / 8; ++i) {
116 * for (unsigned j = 0; j < 8; ++j) {
117 * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
118 * bool compare = abs_squared > max_values[j];
119 * max_values[j] = compare ? abs_squared : max_values[j];
120 * max_indices[j] = compare ? current_indices[j] : max_indices[j];
121 * current_indices[j] += 8; // update for next outer loop iteration
122 * ++src0;
123 * }
124 * }
125 */
126 98296 static inline void vector_32fc_index_max_variant0(__m256 in0,
127 __m256 in1,
128 __m256* max_values,
129 __m256i* max_indices,
130 __m256i* current_indices,
131 __m256i indices_increment)
132 {
133 98296 in0 = _mm256_mul_ps(in0, in0);
134 98296 in1 = _mm256_mul_ps(in1, in1);
135
136 /*
137 * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
138 * hadd_ps(a, b) computes
139 * (b_7 + b_6,
140 * b_5 + b_4,
141 * ---------
142 * a_7 + a_6,
143 * a_5 + a_4,
144 * ---------
145 * b_3 + b_2,
146 * b_1 + b_0,
147 * ---------
148 * a_3 + a_2,
149 * a_1 + a_0).
150 * The result is the squared absolute value of complex numbers at index
151 * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
152 * current_indices!
153 */
154 98296 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
155
156 /*
157 * Compare the recently computed squared absolute values with the
158 * previously determined maximum values. cmp_ps(a, b) determines
159 * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
160 * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
161 *
162 * If either operand is NaN, 0 is returned as an “ordered” comparison is
163 * used => the blend operation will select the value from *max_values.
164 */
165 98296 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
166
167 /* Select maximum by blending. This is the only line which differs from variant1 */
168 98296 *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
169
170 /*
171 * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
172 * each element in the vectors =>
173 * max_indices = compare_mask ? current_indices : max_indices
174 *
175 * Note: The casting of data types is required to make the compiler happy
176 * and does not change values.
177 */
178 98296 *max_indices =
179 393184 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
180 _mm256_castsi256_ps(*current_indices),
181 compare_mask));
182
183 /* compute indices of complex numbers which will be loaded in the next iteration */
184 98296 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
185 98296 }
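
For context, a kernel wrapping this helper would initialize current_indices in the (7, 6, 3, 2, 5, 4, 1, 0) order noted above, run the helper over blocks of eight complex samples, and finish with a horizontal reduction across the eight lanes. The sketch below is illustrative only (unaligned loads, no tail handling for num_points % 8); it is not the volk_32fc_index_max kernel itself:

#include <stdint.h>
#include <volk/volk_complex.h> /* lv_32fc_t */

/* Sketch only: possible driver loop around vector_32fc_index_max_variant0. */
static inline void index_max_driver_sketch(uint32_t* target,
                                           const lv_32fc_t* src0,
                                           uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* Initial order must match the hadd_ps lane order described above. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (uint32_t i = 0; i < num_points / 8u; ++i) {
        const __m256 in0 = _mm256_loadu_ps((const float*)src0);       /* samples 0..3 */
        const __m256 in1 = _mm256_loadu_ps((const float*)(src0 + 4)); /* samples 4..7 */
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    /* Horizontal reduction over the 8 lanes. */
    float values[8];
    uint32_t indices[8];
    _mm256_storeu_ps(values, max_values);
    _mm256_storeu_si256((__m256i*)indices, max_indices);
    float max_value = values[0];
    uint32_t max_index = indices[0];
    for (int j = 1; j < 8; ++j) {
        if (values[j] > max_value) {
            max_value = values[j];
            max_index = indices[j];
        }
    }
    *target = max_index; /* a real kernel would still scan the scalar tail */
}
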
186
187 /* See _variant0 for details */
188 98296 static inline void vector_32fc_index_max_variant1(__m256 in0,
189 __m256 in1,
190 __m256* max_values,
191 __m256i* max_indices,
192 __m256i* current_indices,
193 __m256i indices_increment)
194 {
195 98296 in0 = _mm256_mul_ps(in0, in0);
196 98296 in1 = _mm256_mul_ps(in1, in1);
197
198 98296 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
199 98296 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
200
201 /*
202 * This is the only line which differs from variant0. Using maxps instead of
203 * blendvps is faster on the Intel CPUs tested.
204 *
205 * Note: The order of arguments matters if a NaN is encountered in which
206 * case the value of the second argument is selected. This is consistent
207 * with the “ordered” comparison and the blend operation: The comparison
208 * returns false if a NaN is encountered and the blend operation
209 * consequently selects the value from max_indices.
210 */
211 98296 *max_values = _mm256_max_ps(abs_squared, *max_values);
212
213 98296 *max_indices =
214 393184 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
215 _mm256_castsi256_ps(*current_indices),
216 compare_mask));
217
218 98296 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
219 98296 }
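
The argument-order remark can be checked directly: maxps forwards its second source operand whenever either operand is NaN, so with abs_squared as the first argument a NaN never displaces a previously stored maximum, which mirrors the ordered compare plus blend of variant0. A tiny stand-alone sketch (illustrative, not part of the header):

#include <math.h>
#include <stdio.h>

/* Sketch only: every lane of the result is 1.0f, i.e. the second argument. */
static inline void maxps_nan_demo(void)
{
    const __m256 a = _mm256_set1_ps(NAN);
    const __m256 b = _mm256_set1_ps(1.0f);
    float r[8];
    _mm256_storeu_ps(r, _mm256_max_ps(a, b));
    printf("max(NaN, 1.0f) lane 0 = %f\n", r[0]);
}
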
220
221 /*
222 * The function below vectorizes the inner loop of the following code:
223 *
224 * float min_values[8] = {FLT_MAX};
225 * unsigned min_indices[8] = {0};
226 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
227 * for (unsigned i = 0; i < num_points / 8; ++i) {
228 * for (unsigned j = 0; j < 8; ++j) {
229 * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
230 * bool compare = abs_squared < min_values[j];
231 * min_values[j] = compare ? abs_squared : min_values[j];
232 * min_indices[j] = compare ? current_indices[j] : min_indices[j];
233 * current_indices[j] += 8; // update for next outer loop iteration
234 * ++src0;
235 * }
236 * }
237 */
238 98296 static inline void vector_32fc_index_min_variant0(__m256 in0,
239 __m256 in1,
240 __m256* min_values,
241 __m256i* min_indices,
242 __m256i* current_indices,
243 __m256i indices_increment)
244 {
245 98296 in0 = _mm256_mul_ps(in0, in0);
246 98296 in1 = _mm256_mul_ps(in1, in1);
247
248 /*
249 * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
250 * hadd_ps(a, b) computes
251 * (b_7 + b_6,
252 * b_5 + b_4,
253 * ---------
254 * a_7 + a_6,
255 * a_5 + a_4,
256 * ---------
257 * b_3 + b_2,
258 * b_1 + b_0,
259 * ---------
260 * a_3 + a_2,
261 * a_1 + a_0).
262 * The result is the squared absolute value of complex numbers at index
263 * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
264 * current_indices!
265 */
266 98296 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
267
268 /*
269 * Compare the recently computed squared absolute values with the
270 * previously determined minimum values. cmp_ps(a, b) determines
271 * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
272 * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
273 *
274 * If either operand is NaN, 0 is returned as an “ordered” comparison is
275 * used => the blend operation will select the value from *min_values.
276 */
277 98296 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
278
279 /* Select minimum by blending. This is the only line which differs from variant1 */
280 98296 *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
281
282 /*
283 * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
284 * each element in the vectors =>
285 * min_indices = compare_mask ? current_indices : min_indices
286 *
287 * Note: The casting of data types is required to make the compiler happy
288 * and does not change values.
289 */
290 98296 *min_indices =
291 393184 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
292 _mm256_castsi256_ps(*current_indices),
293 compare_mask));
294
295 /* compute indices of complex numbers which will be loaded in the next iteration */
296 98296 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
297 98296 }
298
299 /* See _variant0 for details */
300 98296 static inline void vector_32fc_index_min_variant1(__m256 in0,
301 __m256 in1,
302 __m256* min_values,
303 __m256i* min_indices,
304 __m256i* current_indices,
305 __m256i indices_increment)
306 {
307 98296 in0 = _mm256_mul_ps(in0, in0);
308 98296 in1 = _mm256_mul_ps(in1, in1);
309
310 98296 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
311 98296 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
312
313 /*
314 * This is the only line which differs from variant0. Using minps instead of
315 * blendvps is faster on the Intel CPUs tested.
316 *
317 * Note: The order of arguments matters if a NaN is encountered in which
318 * case the value of the second argument is selected. This is consistent
319 * with the “ordered” comparison and the blend operation: The comparison
320 * returns false if a NaN is encountered and the blend operation
321 * consequently selects the value from min_indices.
322 */
323 98296 *min_values = _mm256_min_ps(abs_squared, *min_values);
324
325 98296 *min_indices =
326 393184 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
327 _mm256_castsi256_ps(*current_indices),
328 compare_mask));
329
330 98296 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
331 98296 }
332
333 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
334