GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_8u_polarbutterfly_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 154 156 98.7%
Functions: 9 9 100.0%
Branches: 42 44 95.5%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_8u_polarbutterfly_32f
12 *
13 * \b Overview
14 *
15 * decode butterfly for one bit in polar decoder graph.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * volk_32f_8u_polarbutterfly_32f(float* llrs, unsigned char* u,
20 * const int frame_exp, const int stage,
21 * const int u_num, const int row)
22 * \endcode
23 *
24 * \b Inputs
25 * \li llrs: buffer with LLRs. contains received LLRs and already decoded LLRs.
26 * \li u: previously decoded bits
27 * \li frame_size: = 2 ^ frame_exp.
28 * \li frame_exp: power of 2 value for frame size.
29 * \li stage: value in range [0, frame_exp). start stage algorithm goes deeper.
30 * \li u_num: bit number currently to be decoded
31 * \li row: row in graph to start decoding.
32 *
33 * \b Outputs
34 * \li frame: necessary LLRs for bit [u_num] to be decoded
35 *
36 * \b Example
37 * \code
38 * int frame_exp = 10;
39 * int frame_size = 0x01 << frame_exp;
40 *
41 * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
42 * volk_get_alignment()); unsigned char* u = (unsigned char*) volk_malloc(sizeof(unsigned
43 * char) * frame_size * (frame_exp + 1), volk_get_alignment());
44 *
45 * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp,
46 * data)};
47 *
48 * unsigned int u_num;
49 * for(u_num = 0; u_num < frame_size; u_num++){
50 * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num,
51 * u_num);
52 * // next line could first search for frozen bit value and then do bit decision.
53 * u[u_num] = llrs[u_num] > 0 ? 0 : 1;
54 * }
55 *
56 * volk_free(llrs);
57 * volk_free(u);
58 * \endcode
59 */
60
61 #ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
62 #define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
63 #include <math.h>
64 #include <volk/volk_8u_x2_encodeframepolar_8u.h>
65
66 73728 static inline float llr_odd(const float la, const float lb)
67 {
68 73728 const float ala = fabsf(la);
69 73728 const float alb = fabsf(lb);
70
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 73728 times.
73728 return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
71 }
72
73 2048 static inline void llr_odd_stages(
74 float* llrs, int min_stage, const int depth, const int frame_size, const int row)
75 {
76 2048 int loop_stage = depth - 1;
77 float* dst_llr_ptr;
78 float* src_llr_ptr;
79 2048 int stage_size = 0x01 << loop_stage;
80
81 int el;
82
2/2
✓ Branch 0 taken 6144 times.
✓ Branch 1 taken 2048 times.
8192 while (min_stage <= loop_stage) {
83 6144 dst_llr_ptr = llrs + loop_stage * frame_size + row;
84 6144 src_llr_ptr = dst_llr_ptr + frame_size;
85
2/2
✓ Branch 0 taken 14336 times.
✓ Branch 1 taken 6144 times.
20480 for (el = 0; el < stage_size; el++) {
86 14336 *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
87 14336 src_llr_ptr += 2;
88 }
89
90 6144 --loop_stage;
91 6144 stage_size >>= 1;
92 }
93 2048 }
94
95 73728 static inline float llr_even(const float la, const float lb, const unsigned char f)
96 {
97
1/2
✓ Branch 0 taken 73728 times.
✗ Branch 1 not taken.
73728 switch (f) {
98 73728 case 0:
99 73728 return lb + la;
100 default:
101 return lb - la;
102 }
103 }
104
105 static inline void
106 55296 even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
107 {
108 55296 u++;
109 int i;
110
2/2
✓ Branch 0 taken 52430848 times.
✓ Branch 1 taken 55296 times.
52486144 for (i = 1; i < u_num; i += 2) {
111 52430848 *u_even++ = *u;
112 52430848 u += 2;
113 }
114 55296 }
115
116 static inline void
117 55296 odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
118 {
119 int i;
120
2/2
✓ Branch 0 taken 52430848 times.
✓ Branch 1 taken 55296 times.
52486144 for (i = 1; i < u_num; i += 2) {
121 52430848 *u_xor++ = *u ^ *(u + 1);
122 52430848 u += 2;
123 }
124 55296 }
125
126 8192 static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
127 {
128 8192 int max_stage_depth = 0;
129 8192 int half_stage_size = 0x01;
130 8192 int stage_size = half_stage_size << 1;
131
2/2
✓ Branch 0 taken 24560 times.
✓ Branch 1 taken 8 times.
24568 while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
132
2/2
✓ Branch 0 taken 8184 times.
✓ Branch 1 taken 16376 times.
24560 if (!(row % stage_size < half_stage_size)) {
133 8184 break;
134 }
135 16376 half_stage_size <<= 1;
136 16376 stage_size <<= 1;
137 16376 max_stage_depth++;
138 }
139 8192 return max_stage_depth;
140 }
141
142 #ifdef LV_HAVE_GENERIC
143
144 124928 static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
145 unsigned char* u,
146 const int frame_exp,
147 const int stage,
148 const int u_num,
149 const int row)
150 {
151 124928 const int frame_size = 0x01 << frame_exp;
152 124928 const int next_stage = stage + 1;
153
154 124928 const int half_stage_size = 0x01 << stage;
155 124928 const int stage_size = half_stage_size << 1;
156
157 124928 const bool is_upper_stage_half = row % stage_size < half_stage_size;
158
159 // // this is a natural bit order impl
160 124928 float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
161 124928 float* call_row_llr = llrs + row;
162
163 124928 const int section = row - (row % stage_size);
164 124928 const int jump_size = ((row % half_stage_size) << 1) % stage_size;
165
166 124928 const int next_upper_row = section + jump_size;
167 124928 const int next_lower_row = next_upper_row + 1;
168
169 124928 const float* upper_right_llr_ptr = next_llrs + next_upper_row;
170 124928 const float* lower_right_llr_ptr = next_llrs + next_lower_row;
171
172
2/2
✓ Branch 0 taken 65536 times.
✓ Branch 1 taken 59392 times.
124928 if (!is_upper_stage_half) {
173 65536 const int u_pos = u_num >> stage;
174 65536 const unsigned char f = u[u_pos - 1];
175 65536 *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
176 65536 return;
177 }
178
179
2/2
✓ Branch 0 taken 55296 times.
✓ Branch 1 taken 4096 times.
59392 if (frame_exp > next_stage) {
180 55296 unsigned char* u_half = u + frame_size;
181 55296 odd_xor_even_values(u_half, u, u_num);
182 55296 volk_32f_8u_polarbutterfly_32f_generic(
183 next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
184
185 55296 even_u_values(u_half, u, u_num);
186 55296 volk_32f_8u_polarbutterfly_32f_generic(
187 next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
188 }
189
190 59392 *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
191 }
192
193 #endif /* LV_HAVE_GENERIC */
194
195
196 #ifdef LV_HAVE_AVX
197 #include <immintrin.h>
198 #include <volk/volk_avx_intrinsics.h>
199
200 8192 static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
201 unsigned char* u,
202 const int frame_exp,
203 const int stage,
204 const int u_num,
205 const int row)
206 {
207 8192 const int frame_size = 0x01 << frame_exp;
208
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 4096 times.
8192 if (row % 2) { // for odd rows just do the only necessary calculation and return.
209 4096 const float* next_llrs = llrs + frame_size + row;
210 4096 *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
211 4096 return;
212 }
213
214 4096 const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
215
2/2
✓ Branch 0 taken 3072 times.
✓ Branch 1 taken 1024 times.
4096 if (max_stage_depth < 3) { // vectorized version needs larger vectors.
216 3072 volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
217 3072 return;
218 }
219
220 1024 int loop_stage = max_stage_depth;
221 1024 int stage_size = 0x01 << loop_stage;
222
223 float* src_llr_ptr;
224 float* dst_llr_ptr;
225
226 __m256 src0, src1, dst;
227
228
2/2
✓ Branch 0 taken 1022 times.
✓ Branch 1 taken 2 times.
1024 if (row) { // not necessary for ZERO row. == first bit to be decoded.
229 // first do bit combination for all stages
230 // effectively encode some decoded bits again.
231 1022 unsigned char* u_target = u + frame_size;
232 1022 unsigned char* u_temp = u + 2 * frame_size;
233 1022 memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
234
235 1022 volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
236
237 1022 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
238 1022 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
239
240 __m128i fbits;
241
242 int p;
243
2/2
✓ Branch 0 taken 4608 times.
✓ Branch 1 taken 1022 times.
5630 for (p = 0; p < stage_size; p += 8) {
244 4608 fbits = _mm_loadu_si128((__m128i*)u_target);
245 4608 u_target += 8;
246
247 4608 src0 = _mm256_loadu_ps(src_llr_ptr);
248 4608 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
249 4608 src_llr_ptr += 16;
250
251 4608 dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
252
253 _mm256_storeu_ps(dst_llr_ptr, dst);
254 4608 dst_llr_ptr += 8;
255 }
256
257 1022 --loop_stage;
258 1022 stage_size >>= 1;
259 }
260
261 1024 const int min_stage = stage > 2 ? stage : 2;
262
263 _mm256_zeroall(); // Important to clear cache!
264
265 int el;
266
2/2
✓ Branch 0 taken 1022 times.
✓ Branch 1 taken 1024 times.
2046 while (min_stage < loop_stage) {
267 1022 dst_llr_ptr = llrs + loop_stage * frame_size + row;
268 1022 src_llr_ptr = dst_llr_ptr + frame_size;
269
2/2
✓ Branch 0 taken 4608 times.
✓ Branch 1 taken 1022 times.
5630 for (el = 0; el < stage_size; el += 8) {
270 4608 src0 = _mm256_loadu_ps(src_llr_ptr);
271 4608 src_llr_ptr += 8;
272 4608 src1 = _mm256_loadu_ps(src_llr_ptr);
273 4608 src_llr_ptr += 8;
274
275 4608 dst = _mm256_polar_minsum_llrs(src0, src1);
276
277 _mm256_storeu_ps(dst_llr_ptr, dst);
278 4608 dst_llr_ptr += 8;
279 }
280
281 1022 --loop_stage;
282 1022 stage_size >>= 1;
283 }
284
285 // for stages < 3 vectors are too small!.
286 1024 llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
287 }
288
289 #endif /* LV_HAVE_AVX */
290
291 #ifdef LV_HAVE_AVX2
292 #include <immintrin.h>
293 #include <volk/volk_avx2_intrinsics.h>
294
295 8192 static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
296 unsigned char* u,
297 const int frame_exp,
298 const int stage,
299 const int u_num,
300 const int row)
301 {
302 8192 const int frame_size = 0x01 << frame_exp;
303
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 4096 times.
8192 if (row % 2) { // for odd rows just do the only necessary calculation and return.
304 4096 const float* next_llrs = llrs + frame_size + row;
305 4096 *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
306 4096 return;
307 }
308
309 4096 const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
310
2/2
✓ Branch 0 taken 3072 times.
✓ Branch 1 taken 1024 times.
4096 if (max_stage_depth < 3) { // vectorized version needs larger vectors.
311 3072 volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
312 3072 return;
313 }
314
315 1024 int loop_stage = max_stage_depth;
316 1024 int stage_size = 0x01 << loop_stage;
317
318 float* src_llr_ptr;
319 float* dst_llr_ptr;
320
321 __m256 src0, src1, dst;
322
323
2/2
✓ Branch 0 taken 1022 times.
✓ Branch 1 taken 2 times.
1024 if (row) { // not necessary for ZERO row. == first bit to be decoded.
324 // first do bit combination for all stages
325 // effectively encode some decoded bits again.
326 1022 unsigned char* u_target = u + frame_size;
327 1022 unsigned char* u_temp = u + 2 * frame_size;
328 1022 memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
329
330 1022 volk_8u_x2_encodeframepolar_8u_u_avx2(u_target, u_temp, stage_size);
331
332 1022 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
333 1022 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
334
335 __m128i fbits;
336
337 int p;
338
2/2
✓ Branch 0 taken 4608 times.
✓ Branch 1 taken 1022 times.
5630 for (p = 0; p < stage_size; p += 8) {
339 4608 fbits = _mm_loadu_si128((__m128i*)u_target);
340 4608 u_target += 8;
341
342 4608 src0 = _mm256_loadu_ps(src_llr_ptr);
343 4608 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
344 4608 src_llr_ptr += 16;
345
346 4608 dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
347
348 _mm256_storeu_ps(dst_llr_ptr, dst);
349 4608 dst_llr_ptr += 8;
350 }
351
352 1022 --loop_stage;
353 1022 stage_size >>= 1;
354 }
355
356 1024 const int min_stage = stage > 2 ? stage : 2;
357
358 _mm256_zeroall(); // Important to clear cache!
359
360 int el;
361
2/2
✓ Branch 0 taken 1022 times.
✓ Branch 1 taken 1024 times.
2046 while (min_stage < loop_stage) {
362 1022 dst_llr_ptr = llrs + loop_stage * frame_size + row;
363 1022 src_llr_ptr = dst_llr_ptr + frame_size;
364
2/2
✓ Branch 0 taken 4608 times.
✓ Branch 1 taken 1022 times.
5630 for (el = 0; el < stage_size; el += 8) {
365 4608 src0 = _mm256_loadu_ps(src_llr_ptr);
366 4608 src_llr_ptr += 8;
367 4608 src1 = _mm256_loadu_ps(src_llr_ptr);
368 4608 src_llr_ptr += 8;
369
370 4608 dst = _mm256_polar_minsum_llrs(src0, src1);
371
372 _mm256_storeu_ps(dst_llr_ptr, dst);
373 4608 dst_llr_ptr += 8;
374 }
375
376 1022 --loop_stage;
377 1022 stage_size >>= 1;
378 }
379
380 // for stages < 3 vectors are too small!.
381 1024 llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
382 }
383
384 #endif /* LV_HAVE_AVX2 */
385
386 #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
387