/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX2 intrinsics of intrinsics.
 * They should be used in VOLK kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk/volk_avx_intrinsics.h"
#include <immintrin.h>

/*
 * Build a sign-flip mask from eight packed decision bytes: float lane i of
 * the result has its sign bit set iff byte i of fbits is positive. XORing a
 * float vector with this mask conditionally negates its elements.
 */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    /* Within each 128-bit half, every group of four selector bytes routes one
     * decision byte into the most significant (sign) byte of a 32-bit lane;
     * 0xff selects zero. */
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}
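
/*
 * Usage sketch (illustrative only; the function name is hypothetical, not
 * VOLK API): flip the sign of each of the eight LLRs whose corresponding
 * decision byte in fbits is positive.
 */
static inline __m256 example_polar_apply_sign_avx2(__m256 llrs, __m128i fbits)
{
    const __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
    return _mm256_xor_ps(llrs, sign_mask);
}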

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
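
/*
 * Scalar reference for the vectorized sign-and-add above (illustrative
 * sketch; the function name is hypothetical, not VOLK API): per element,
 * llr0 is negated when the decision byte is positive, then llr1 is added.
 */
static inline float example_polar_fsign_add_llr_scalar(float llr0, float llr1, signed char fbit)
{
    return (fbit > 0 ? -llr0 : llr0) + llr1;
}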

static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    // hadd_ps interleaves the 128-bit lanes; the permute restores input order
    return _mm256_permutevar8x32_ps(complex_result, idx);
}
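
/*
 * Scalar reference (illustrative sketch; the function name is hypothetical,
 * not VOLK API): the intrinsic above yields re * re + im * im for eight
 * complex floats stored as interleaved re/im pairs across the two inputs.
 */
static inline float example_magnitude_squared_scalar(float re, float im)
{
    return re * re + im * im;
}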

static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
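
/*
 * Caller-side sketch (illustrative only; the function name, pointer layout
 * and unaligned loads are assumptions, not VOLK API): compute
 * |y - x|^2 * scale for 8 complex symbols stored as interleaved re/im floats.
 */
static inline __m256 example_scaled_norm_dist_load(const float* symbols,
                                                   const float* points,
                                                   float scale)
{
    const __m256 y0 = _mm256_loadu_ps(symbols);
    const __m256 y1 = _mm256_loadu_ps(symbols + 8);
    const __m256 x0 = _mm256_loadu_ps(points);
    const __m256 x1 = _mm256_loadu_ps(points + 8);
    return _mm256_scaled_norm_dist_ps_avx2(y0, y1, x0, x1, _mm256_set1_ps(scale));
}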

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) computes, for each
     * element in the vectors,
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for each
     * element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
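
/*
 * Illustrative epilogue (a sketch under the assumption that the caller keeps
 * the running state in max_values/max_indices; not VOLK API): reduce the
 * eight per-lane maxima to a single scalar result after the vectorized loop.
 */
static inline unsigned example_reduce_max_index(__m256 max_values, __m256i max_indices)
{
    float vals[8];
    unsigned idx[8];
    _mm256_storeu_ps(vals, max_values);
    _mm256_storeu_si256((__m256i*)idx, max_indices);
    float best_val = vals[0];
    unsigned best = idx[0];
    for (int i = 1; i < 8; ++i) {
        if (vals[i] > best_val) { /* strict '>' keeps the first lane on ties */
            best_val = vals[i];
            best = idx[i];
        }
    }
    return best;
}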

/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
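
/*
 * Caller-side sketch (illustrative only; function name and loads are
 * assumptions, not VOLK API): find the index of the complex number with the
 * largest magnitude. 'src' points to interleaved re/im floats and num_points
 * is assumed to be a multiple of 8; a real kernel would handle the tail.
 */
static inline unsigned example_index_max(const float* src, unsigned num_points)
{
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();
    /* initial order must match the (7, 6, 3, 2, 5, 4, 1, 0) hadd_ps layout */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);
    for (unsigned i = 0; i < num_points / 8; ++i) {
        const __m256 in0 = _mm256_loadu_ps(src + 16 * i);
        const __m256 in1 = _mm256_loadu_ps(src + 16 * i + 8);
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    }
    return example_reduce_max_index(max_values, max_indices);
}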

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) computes, for each
     * element in the vectors,
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for each
     * element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
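
/*
 * Caller-side sketch for the min search (illustrative only, not VOLK API).
 * It mirrors example_index_max above; the differences are the FLT_MAX start
 * value (FLT_MAX comes from <float.h>) and the '<' reduction.
 */
static inline unsigned example_index_min(const float* src, unsigned num_points)
{
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);
    for (unsigned i = 0; i < num_points / 8; ++i) {
        const __m256 in0 = _mm256_loadu_ps(src + 16 * i);
        const __m256 in1 = _mm256_loadu_ps(src + 16 * i + 8);
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    }
    float vals[8];
    unsigned idx[8];
    _mm256_storeu_ps(vals, min_values);
    _mm256_storeu_si256((__m256i*)idx, min_indices);
    float best_val = vals[0];
    unsigned best = idx[0];
    for (int j = 1; j < 8; ++j) {
        if (vals[j] < best_val) {
            best_val = vals[j];
            best = idx[j];
        }
    }
    return best;
}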

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */