Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2015 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_8u_polarbutterfly_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Decode butterfly for one bit in the polar decoder graph. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * volk_32f_8u_polarbutterfly_32f(float* llrs, unsigned char* u, | ||
20 | *     const int frame_exp, const int stage, | ||
21 | *     const int u_num, const int row) | ||
22 | * \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li llrs: buffer of LLRs; holds the received channel LLRs as well as the LLRs computed so far. | ||
26 | * \li u: previously decoded bits. | ||
27 | * \li frame_exp: power-of-two exponent of the frame size; the frame size is 2 ^ frame_exp. | ||
28 | * \li stage: stage at which the algorithm starts before going deeper; value in the range [0, frame_exp). | ||
29 | * \li u_num: number of the bit currently being decoded. | ||
30 | * \li row: row in the decoder graph at which decoding starts. | ||
31 | * | ||
32 | * | ||
33 | * \b Outputs | ||
34 | * \li llrs: updated in place with the LLRs needed to decide bit u_num. | ||
35 | * | ||
36 | * \b Example | ||
37 | * \code | ||
38 | * int frame_exp = 10; | ||
39 | * int frame_size = 0x01 << frame_exp; | ||
40 | * | ||
41 | * float* llrs = (float*)volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), | ||
42 | *     volk_get_alignment()); | ||
43 | * unsigned char* u = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment()); | ||
44 | * | ||
45 | * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, | ||
46 | * data)}; | ||
47 | * | ||
48 | * unsigned int u_num; | ||
49 | * for(u_num = 0; u_num < frame_size; u_num++){ | ||
50 | * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); | ||
51 | * | ||
52 | * // the next line could first check whether bit u_num is frozen and then make the bit decision. | ||
53 | * u[u_num] = llrs[u_num] > 0 ? 0 : 1; | ||
54 | * } | ||
55 | * | ||
56 | * volk_free(llrs); | ||
57 | * volk_free(u); | ||
58 | * \endcode | ||
59 | */ | ||
60 | |||
61 | #ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ | ||
62 | #define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ | ||
63 | #include <math.h> | ||
64 | #include <volk/volk_8u_x2_encodeframepolar_8u.h> | ||
65 | |||
66 | 73728 | static inline float llr_odd(const float la, const float lb) | |
67 | { | ||
68 | 73728 | const float ala = fabsf(la); | |
69 | 73728 | const float alb = fabsf(lb); | |
70 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 73728 times. | 73728 | return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); |
71 | } | ||
72 | |||
73 | 2048 | static inline void llr_odd_stages( | |
74 | float* llrs, int min_stage, const int depth, const int frame_size, const int row) | ||
75 | { | ||
76 | 2048 | int loop_stage = depth - 1; | |
77 | float* dst_llr_ptr; | ||
78 | float* src_llr_ptr; | ||
79 | 2048 | int stage_size = 0x01 << loop_stage; | |
80 | |||
81 | int el; | ||
82 | 2/2 ✓ Branch 0 taken 6144 times. ✓ Branch 1 taken 2048 times. | 8192 | while (min_stage <= loop_stage) { |
83 | 6144 | dst_llr_ptr = llrs + loop_stage * frame_size + row; | |
84 | 6144 | src_llr_ptr = dst_llr_ptr + frame_size; | |
85 | 2/2 ✓ Branch 0 taken 14336 times. ✓ Branch 1 taken 6144 times. | 20480 | for (el = 0; el < stage_size; el++) { |
86 | 14336 | *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); | |
87 | 14336 | src_llr_ptr += 2; | |
88 | } | ||
89 | |||
90 | 6144 | --loop_stage; | |
91 | 6144 | stage_size >>= 1; | |
92 | } | ||
93 | 2048 | } | |
94 | |||
95 | 73728 | static inline float llr_even(const float la, const float lb, const unsigned char f) | |
96 | { | ||
97 | 1/2 ✓ Branch 0 taken 73728 times. ✗ Branch 1 not taken. | 73728 | switch (f) { |
98 | 73728 | case 0: | |
99 | 73728 | return lb + la; | |
100 | ✗ | default: | |
101 | ✗ | return lb - la; | |
102 | } | ||
103 | } | ||
104 | |||
105 | static inline void | ||
106 | 55296 | even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num) | |
107 | { | ||
108 | 55296 | u++; | |
109 | int i; | ||
110 | 2/2 ✓ Branch 0 taken 52430848 times. ✓ Branch 1 taken 55296 times. | 52486144 | for (i = 1; i < u_num; i += 2) { |
111 | 52430848 | *u_even++ = *u; | |
112 | 52430848 | u += 2; | |
113 | } | ||
114 | 55296 | } | |
115 | |||
116 | static inline void | ||
117 | 55296 | odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num) | |
118 | { | ||
119 | int i; | ||
120 | 2/2 ✓ Branch 0 taken 52430848 times. ✓ Branch 1 taken 55296 times. | 52486144 | for (i = 1; i < u_num; i += 2) { |
121 | 52430848 | *u_xor++ = *u ^ *(u + 1); | |
122 | 52430848 | u += 2; | |
123 | } | ||
124 | 55296 | } | |
125 | |||
126 | 8192 | static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row) | |
127 | { | ||
128 | 8192 | int max_stage_depth = 0; | |
129 | 8192 | int half_stage_size = 0x01; | |
130 | 8192 | int stage_size = half_stage_size << 1; | |
131 | 2/2 ✓ Branch 0 taken 24560 times. ✓ Branch 1 taken 8 times. | 24568 | while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values. |
132 | 2/2 ✓ Branch 0 taken 8184 times. ✓ Branch 1 taken 16376 times. | 24560 | if (!(row % stage_size < half_stage_size)) { |
133 | 8184 | break; | |
134 | } | ||
135 | 16376 | half_stage_size <<= 1; | |
136 | 16376 | stage_size <<= 1; | |
137 | 16376 | max_stage_depth++; | |
138 | } | ||
139 | 8192 | return max_stage_depth; | |
140 | } | ||
141 | |||
142 | #ifdef LV_HAVE_GENERIC | ||
143 | |||
144 | 124928 | static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs, | |
145 | unsigned char* u, | ||
146 | const int frame_exp, | ||
147 | const int stage, | ||
148 | const int u_num, | ||
149 | const int row) | ||
150 | { | ||
151 | 124928 | const int frame_size = 0x01 << frame_exp; | |
152 | 124928 | const int next_stage = stage + 1; | |
153 | |||
154 | 124928 | const int half_stage_size = 0x01 << stage; | |
155 | 124928 | const int stage_size = half_stage_size << 1; | |
156 | |||
157 | 124928 | const bool is_upper_stage_half = row % stage_size < half_stage_size; | |
158 | |||
159 | // this is a natural bit order impl | ||
160 | 124928 | float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array. | |
161 | 124928 | float* call_row_llr = llrs + row; | |
162 | |||
163 | 124928 | const int section = row - (row % stage_size); | |
164 | 124928 | const int jump_size = ((row % half_stage_size) << 1) % stage_size; | |
165 | |||
166 | 124928 | const int next_upper_row = section + jump_size; | |
167 | 124928 | const int next_lower_row = next_upper_row + 1; | |
168 | |||
169 | 124928 | const float* upper_right_llr_ptr = next_llrs + next_upper_row; | |
170 | 124928 | const float* lower_right_llr_ptr = next_llrs + next_lower_row; | |
171 | |||
172 | 2/2 ✓ Branch 0 taken 65536 times. ✓ Branch 1 taken 59392 times. | 124928 | if (!is_upper_stage_half) { |
173 | 65536 | const int u_pos = u_num >> stage; | |
174 | 65536 | const unsigned char f = u[u_pos - 1]; | |
175 | 65536 | *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); | |
176 | 65536 | return; | |
177 | } | ||
178 | |||
179 | 2/2 ✓ Branch 0 taken 55296 times. ✓ Branch 1 taken 4096 times. | 59392 | if (frame_exp > next_stage) { |
180 | 55296 | unsigned char* u_half = u + frame_size; | |
181 | 55296 | odd_xor_even_values(u_half, u, u_num); | |
182 | 55296 | volk_32f_8u_polarbutterfly_32f_generic( | |
183 | next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); | ||
184 | |||
185 | 55296 | even_u_values(u_half, u, u_num); | |
186 | 55296 | volk_32f_8u_polarbutterfly_32f_generic( | |
187 | next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); | ||
188 | } | ||
189 | |||
190 | 59392 | *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); | |
191 | } | ||
192 | |||
193 | #endif /* LV_HAVE_GENERIC */ | ||
194 | |||
195 | |||
196 | #ifdef LV_HAVE_AVX | ||
197 | #include <immintrin.h> | ||
198 | #include <volk/volk_avx_intrinsics.h> | ||
199 | |||
200 | 8192 | static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, | |
201 | unsigned char* u, | ||
202 | const int frame_exp, | ||
203 | const int stage, | ||
204 | const int u_num, | ||
205 | const int row) | ||
206 | { | ||
207 | 8192 | const int frame_size = 0x01 << frame_exp; | |
208 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 4096 times. | 8192 | if (row % 2) { // for odd rows just do the only necessary calculation and return. |
209 | 4096 | const float* next_llrs = llrs + frame_size + row; | |
210 | 4096 | *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); | |
211 | 4096 | return; | |
212 | } | ||
213 | |||
214 | 4096 | const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); | |
215 | 2/2 ✓ Branch 0 taken 3072 times. ✓ Branch 1 taken 1024 times. | 4096 | if (max_stage_depth < 3) { // vectorized version needs larger vectors. |
216 | 3072 | volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); | |
217 | 3072 | return; | |
218 | } | ||
219 | |||
220 | 1024 | int loop_stage = max_stage_depth; | |
221 | 1024 | int stage_size = 0x01 << loop_stage; | |
222 | |||
223 | float* src_llr_ptr; | ||
224 | float* dst_llr_ptr; | ||
225 | |||
226 | __m256 src0, src1, dst; | ||
227 | |||
228 | 2/2 ✓ Branch 0 taken 1022 times. ✓ Branch 1 taken 2 times. | 1024 | if (row) { // not necessary for ZERO row. == first bit to be decoded. |
229 | // first do bit combination for all stages | ||
230 | // effectively encode some decoded bits again. | ||
231 | 1022 | unsigned char* u_target = u + frame_size; | |
232 | 1022 | unsigned char* u_temp = u + 2 * frame_size; | |
233 | 1022 | memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); | |
234 | |||
235 | 1022 | volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); | |
236 | |||
237 | 1022 | src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; | |
238 | 1022 | dst_llr_ptr = llrs + max_stage_depth * frame_size + row; | |
239 | |||
240 | __m128i fbits; | ||
241 | |||
242 | int p; | ||
243 | 2/2 ✓ Branch 0 taken 4608 times. ✓ Branch 1 taken 1022 times. | 5630 | for (p = 0; p < stage_size; p += 8) { |
244 | 4608 | fbits = _mm_loadu_si128((__m128i*)u_target); | |
245 | 4608 | u_target += 8; | |
246 | |||
247 | 4608 | src0 = _mm256_loadu_ps(src_llr_ptr); | |
248 | 4608 | src1 = _mm256_loadu_ps(src_llr_ptr + 8); | |
249 | 4608 | src_llr_ptr += 16; | |
250 | |||
251 | 4608 | dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); | |
252 | |||
253 | _mm256_storeu_ps(dst_llr_ptr, dst); | ||
254 | 4608 | dst_llr_ptr += 8; | |
255 | } | ||
256 | |||
257 | 1022 | --loop_stage; | |
258 | 1022 | stage_size >>= 1; | |
259 | } | ||
260 | |||
261 | 1024 | const int min_stage = stage > 2 ? stage : 2; | |
262 | |||
263 | _mm256_zeroall(); // Important to clear cache! | ||
264 | |||
265 | int el; | ||
266 | 2/2 ✓ Branch 0 taken 1022 times. ✓ Branch 1 taken 1024 times. | 2046 | while (min_stage < loop_stage) { |
267 | 1022 | dst_llr_ptr = llrs + loop_stage * frame_size + row; | |
268 | 1022 | src_llr_ptr = dst_llr_ptr + frame_size; | |
269 | 2/2 ✓ Branch 0 taken 4608 times. ✓ Branch 1 taken 1022 times. | 5630 | for (el = 0; el < stage_size; el += 8) { |
270 | 4608 | src0 = _mm256_loadu_ps(src_llr_ptr); | |
271 | 4608 | src_llr_ptr += 8; | |
272 | 4608 | src1 = _mm256_loadu_ps(src_llr_ptr); | |
273 | 4608 | src_llr_ptr += 8; | |
274 | |||
275 | 4608 | dst = _mm256_polar_minsum_llrs(src0, src1); | |
276 | |||
277 | _mm256_storeu_ps(dst_llr_ptr, dst); | ||
278 | 4608 | dst_llr_ptr += 8; | |
279 | } | ||
280 | |||
281 | 1022 | --loop_stage; | |
282 | 1022 | stage_size >>= 1; | |
283 | } | ||
284 | |||
285 | // for stages < 3 vectors are too small! | ||
286 | 1024 | llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); | |
287 | } | ||
288 | |||
289 | #endif /* LV_HAVE_AVX */ | ||
290 | |||
291 | #ifdef LV_HAVE_AVX2 | ||
292 | #include <immintrin.h> | ||
293 | #include <volk/volk_avx2_intrinsics.h> | ||
294 | |||
295 | 8192 | static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, | |
296 | unsigned char* u, | ||
297 | const int frame_exp, | ||
298 | const int stage, | ||
299 | const int u_num, | ||
300 | const int row) | ||
301 | { | ||
302 | 8192 | const int frame_size = 0x01 << frame_exp; | |
303 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 4096 times. | 8192 | if (row % 2) { // for odd rows just do the only necessary calculation and return. |
304 | 4096 | const float* next_llrs = llrs + frame_size + row; | |
305 | 4096 | *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); | |
306 | 4096 | return; | |
307 | } | ||
308 | |||
309 | 4096 | const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); | |
310 | 2/2 ✓ Branch 0 taken 3072 times. ✓ Branch 1 taken 1024 times. | 4096 | if (max_stage_depth < 3) { // vectorized version needs larger vectors. |
311 | 3072 | volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); | |
312 | 3072 | return; | |
313 | } | ||
314 | |||
315 | 1024 | int loop_stage = max_stage_depth; | |
316 | 1024 | int stage_size = 0x01 << loop_stage; | |
317 | |||
318 | float* src_llr_ptr; | ||
319 | float* dst_llr_ptr; | ||
320 | |||
321 | __m256 src0, src1, dst; | ||
322 | |||
323 | 2/2 ✓ Branch 0 taken 1022 times. ✓ Branch 1 taken 2 times. | 1024 | if (row) { // not necessary for ZERO row. == first bit to be decoded. |
324 | // first do bit combination for all stages | ||
325 | // effectively encode some decoded bits again. | ||
326 | 1022 | unsigned char* u_target = u + frame_size; | |
327 | 1022 | unsigned char* u_temp = u + 2 * frame_size; | |
328 | 1022 | memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); | |
329 | |||
330 | 1022 | volk_8u_x2_encodeframepolar_8u_u_avx2(u_target, u_temp, stage_size); | |
331 | |||
332 | 1022 | src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; | |
333 | 1022 | dst_llr_ptr = llrs + max_stage_depth * frame_size + row; | |
334 | |||
335 | __m128i fbits; | ||
336 | |||
337 | int p; | ||
338 | 2/2 ✓ Branch 0 taken 4608 times. ✓ Branch 1 taken 1022 times. | 5630 | for (p = 0; p < stage_size; p += 8) { |
339 | 4608 | fbits = _mm_loadu_si128((__m128i*)u_target); | |
340 | 4608 | u_target += 8; | |
341 | |||
342 | 4608 | src0 = _mm256_loadu_ps(src_llr_ptr); | |
343 | 4608 | src1 = _mm256_loadu_ps(src_llr_ptr + 8); | |
344 | 4608 | src_llr_ptr += 16; | |
345 | |||
346 | 4608 | dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); | |
347 | |||
348 | _mm256_storeu_ps(dst_llr_ptr, dst); | ||
349 | 4608 | dst_llr_ptr += 8; | |
350 | } | ||
351 | |||
352 | 1022 | --loop_stage; | |
353 | 1022 | stage_size >>= 1; | |
354 | } | ||
355 | |||
356 | 1024 | const int min_stage = stage > 2 ? stage : 2; | |
357 | |||
358 | _mm256_zeroall(); // Important to clear cache! | ||
359 | |||
360 | int el; | ||
361 | 2/2 ✓ Branch 0 taken 1022 times. ✓ Branch 1 taken 1024 times. | 2046 | while (min_stage < loop_stage) { |
362 | 1022 | dst_llr_ptr = llrs + loop_stage * frame_size + row; | |
363 | 1022 | src_llr_ptr = dst_llr_ptr + frame_size; | |
364 | 2/2 ✓ Branch 0 taken 4608 times. ✓ Branch 1 taken 1022 times. | 5630 | for (el = 0; el < stage_size; el += 8) { |
365 | 4608 | src0 = _mm256_loadu_ps(src_llr_ptr); | |
366 | 4608 | src_llr_ptr += 8; | |
367 | 4608 | src1 = _mm256_loadu_ps(src_llr_ptr); | |
368 | 4608 | src_llr_ptr += 8; | |
369 | |||
370 | 4608 | dst = _mm256_polar_minsum_llrs(src0, src1); | |
371 | |||
372 | _mm256_storeu_ps(dst_llr_ptr, dst); | ||
373 | 4608 | dst_llr_ptr += 8; | |
374 | } | ||
375 | |||
376 | 1022 | --loop_stage; | |
377 | 1022 | stage_size >>= 1; | |
378 | } | ||
379 | |||
380 | // for stages < 3 vectors are too small! | ||
381 | 1024 | llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); | |
382 | } | ||
383 | |||
384 | #endif /* LV_HAVE_AVX2 */ | ||
385 | |||
386 | #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */ | ||
387 |
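
A note on the two scalar helpers at the top of the listing: llr_odd() and llr_even() are the usual min-sum f/g update rules of successive-cancellation polar decoding. Written out, this restates the listed code and nothing more:

```latex
% min-sum update rules, restating llr_odd() and llr_even() from the listing
f(L_a, L_b)    = \operatorname{sign}(L_a)\,\operatorname{sign}(L_b)\,\min(|L_a|, |L_b|)   \quad\text{(llr\_odd)}
g(L_a, L_b, u) = L_b + (1 - 2u)\,L_a =
  \begin{cases}
    L_b + L_a, & u = 0 \\
    L_b - L_a, & u = 1
  \end{cases} \quad\text{(llr\_even)}
```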
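calculate_max_stage_depth_for_row() walks the stage sizes upward until the row index falls into the lower half of a stage. A hedged, equivalent closed form (an observation about the listed loop, not part of the VOLK API): the result is the number of trailing zero bits of row, capped at frame_exp - 1.

```c
/* Sketch only: closed-form equivalent of calculate_max_stage_depth_for_row()
 * as listed above, assuming frame_exp >= 1. */
static inline int max_stage_depth_by_trailing_zeros(const int frame_exp, const int row)
{
    const int cap = frame_exp - 1; /* last stage holds the received values */
    if (row == 0)
        return cap; /* row 0 never leaves the upper half, so the loop runs to the cap */
    int tz = 0;
    int r = row;
    while ((r & 1) == 0) { /* count trailing zero bits of row */
        tz++;
        r >>= 1;
    }
    return tz < cap ? tz : cap;
}
```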
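Finally, a slightly more complete variant of the usage example from the header comment. This is a sketch, not canonical VOLK documentation: fill_channel_llrs() and frozen_bit_mask are hypothetical placeholders the caller must provide, and the generic protokernel is used because it is always compiled in.

```c
#include <string.h>
#include <volk/volk.h>
#include <volk/volk_malloc.h>
#include <volk/volk_32f_8u_polarbutterfly_32f.h>

/* Hypothetical helpers supplied by the caller (not part of VOLK). */
extern void fill_channel_llrs(float* channel_llrs, int frame_size);
extern const unsigned char* frozen_bit_mask; /* 1 = frozen bit, 0 = information bit */

static void decode_frame(void)
{
    const int frame_exp = 10;
    const int frame_size = 1 << frame_exp;

    /* (frame_exp + 1) blocks of frame_size values; the last block holds the channel LLRs. */
    float* llrs = (float*)volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
                                      volk_get_alignment());
    unsigned char* u = (unsigned char*)volk_malloc(
        sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment());
    memset(u, 0, sizeof(unsigned char) * frame_size * (frame_exp + 1));

    fill_channel_llrs(llrs + frame_size * frame_exp, frame_size);

    for (int u_num = 0; u_num < frame_size; u_num++) {
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
        /* Frozen bits are forced to 0; information bits follow the sign of the LLR. */
        u[u_num] = frozen_bit_mask[u_num] ? 0 : (llrs[u_num] > 0 ? 0 : 1);
    }

    volk_free(llrs);
    volk_free(u);
}
```

On machines where LV_HAVE_AVX or LV_HAVE_AVX2 is defined, the _u_avx or _u_avx2 protokernels shown in the listing can be substituted for the generic call with the same arguments.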