| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2015 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32f_8u_polarbutterfly_32f | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * decode butterfly for one bit in polar decoder graph. | ||
| 16 | * | ||
| 17 | * <b>Dispatcher Prototype</b> | ||
| 18 | * \code | ||
| 19 | * volk_32f_8u_polarbutterfly_32f(float* llrs, unsigned char* u, | ||
| 20 | * const int frame_exp, | ||
| 21 | * const int stage, const int u_num, const int row) | ||
| 22 | * \endcode | ||
| 23 | * | ||
| 24 | * \b Inputs | ||
| 25 | * \li llrs: buffer with LLRs. contains received LLRs and already decoded LLRs. | ||
| 26 | * \li u: previously decoded bits | ||
| 27 | * \li frame_size: = 2 ^ frame_exp. | ||
| 28 | * \li frame_exp: power of 2 value for frame size. | ||
| 29 | * \li stage: value in range [0, frame_exp). start stage algorithm goes deeper. | ||
| 30 | * \li u_num: bit number currently to be decoded | ||
| 31 | * \li row: row in graph to start decoding. | ||
| 32 | * | ||
| 33 | * \b Outputs | ||
| 34 | * \li llrs: necessary LLRs for bit [u_num] to be decoded | ||
| 35 | * | ||
| 36 | * \b Example | ||
| 37 | * \code | ||
| 38 | * int frame_exp = 10; | ||
| 39 | * int frame_size = 0x01 << frame_exp; | ||
| 40 | * | ||
| 41 | * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), | ||
| 42 | * volk_get_alignment()); unsigned char* u = (unsigned char*) volk_malloc(sizeof(unsigned | ||
| 43 | * char) * frame_size * (frame_exp + 1), volk_get_alignment()); | ||
| 44 | * | ||
| 45 | * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, | ||
| 46 | * data)}; | ||
| 47 | * | ||
| 48 | * unsigned int u_num; | ||
| 49 | * for(u_num = 0; u_num < frame_size; u_num++){ | ||
| 50 | * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, | ||
| 51 | * u_num); | ||
| 52 | * // next line could first search for frozen bit value and then do bit decision. | ||
| 53 | * u[u_num] = llrs[u_num] > 0 ? 0 : 1; | ||
| 54 | * } | ||
| 55 | * | ||
| 56 | * volk_free(llrs); | ||
| 57 | * volk_free(u); | ||
| 58 | * \endcode | ||
| 59 | */ | ||
| 60 | |||
| 61 | #ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ | ||
| 62 | #define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ | ||
#include <math.h>
#include <stdbool.h> /* 'bool' in the generic kernel */
#include <string.h>  /* memcpy in the AVX kernels */
#include <volk/volk_8u_x2_encodeframepolar_8u.h>
| 65 | |||
/*!
 * Min-sum approximation of the check-node ("odd") LLR update:
 * sign(la) * sign(lb) * min(|la|, |lb|).
 */
static inline float llr_odd(const float la, const float lb)
{
    const float mag_a = fabsf(la);
    const float mag_b = fabsf(lb);
    const float min_mag = (mag_b < mag_a) ? mag_b : mag_a;
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * min_mag;
}
| 72 | |||
/*!
 * Run the scalar min-sum ("odd") update for every stage from depth - 1
 * down to min_stage.  Stage s occupies frame_size floats starting at
 * llrs + s * frame_size; each output at stage s consumes a consecutive
 * pair of LLRs from stage s + 1.
 */
static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int stage = depth - 1;
    int outputs = 0x01 << stage; // outputs per stage; halves each level down.

    for (; stage >= min_stage; --stage, outputs >>= 1) {
        float* dst = llrs + stage * frame_size + row;
        const float* src = dst + frame_size;
        int k;
        for (k = 0; k < outputs; ++k) {
            dst[k] = llr_odd(src[2 * k], src[2 * k + 1]);
        }
    }
}
| 94 | |||
/*!
 * Variable-node ("even") LLR update: the partial-sum bit f selects
 * whether the first LLR is added to (f == 0) or subtracted from
 * (f != 0) the second.
 */
static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    return f ? (lb - la) : (lb + la);
}
| 104 | |||
/*!
 * Gather the bits at odd indices u[1], u[3], u[5], ... (indices below
 * u_num) into the compact buffer u_even.
 */
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    int idx;
    for (idx = 1; idx < u_num; idx += 2) {
        u_even[(idx - 1) / 2] = u[idx];
    }
}
| 115 | |||
/*!
 * Combine consecutive bit pairs: u_xor[k] = u[2k] ^ u[2k + 1], for all
 * pairs whose second index is below u_num.
 */
static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int idx;
    for (idx = 1; idx < u_num; idx += 2) {
        u_xor[(idx - 1) / 2] = u[idx - 1] ^ u[idx];
    }
}
| 125 | |||
/*!
 * Depth of the deepest stage whose LLRs must be recomputed for 'row'.
 * This equals the number of trailing zero bits of 'row', capped at
 * frame_exp - 1 because the last stage holds the received values.
 */
static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int depth = 0;
    while (depth < frame_exp - 1 && ((row >> depth) & 0x01) == 0) {
        depth++;
    }
    return depth;
}
| 141 | |||
| 142 | #ifdef LV_HAVE_GENERIC | ||
| 143 | |||
/*!
 * Scalar reference butterfly for the successive-cancellation polar decoder.
 *
 * Computes the LLR for bit u_num at position 'row' of stage 'stage' and
 * writes it to llrs[row].  The LLR buffer is laid out as consecutive
 * stages of frame_size floats each (stage + 1 starts at llrs + frame_size).
 *
 * \param llrs      LLR buffer; this call's stage starts at llrs[0].
 * \param u         previously decoded bits. The recursion writes partial
 *                  sums into u + frame_size, so the caller must provide
 *                  scratch space beyond the decoded bits.
 * \param frame_exp log2 of the frame size.
 * \param stage     current stage, in [0, frame_exp).
 * \param u_num     index of the bit currently being decoded.
 * \param row       graph row handled by this call.
 */
static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    // Lower half of a butterfly pair only needs the 'even' update.
    // NOTE(review): 'bool' requires <stdbool.h> when compiled as C;
    // presumably included transitively here — verify.
    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // // this is a natural bit order impl
    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
    float* call_row_llr = llrs + row;

    // Rows of the stage+1 butterfly pair feeding this row.
    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        // Lower half: the right-hand LLRs are already up to date; combine
        // them with the partial-sum bit f and return without recursing.
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        // Upper half: first refresh both feeding rows of the next stage,
        // passing the recombined bits (XOR pairs, then even bits) as the
        // next stage's partial sums. Scratch lives at u + frame_size.
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    // Min-sum (odd) update combining the two freshly computed LLRs.
    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}
| 192 | |||
| 193 | #endif /* LV_HAVE_GENERIC */ | ||
| 194 | |||
| 195 | |||
| 196 | #ifdef LV_HAVE_AVX | ||
| 197 | #include <immintrin.h> | ||
| 198 | #include <volk/volk_avx_intrinsics.h> | ||
| 199 | |||
/*!
 * AVX (unaligned) polar-decoder butterfly.
 *
 * Same contract as the generic kernel: computes the LLR for bit u_num at
 * 'row' into llrs[row].  Odd rows and shallow recursions (max stage depth
 * < 3) fall back to scalar code; deep rows re-encode already decoded bits
 * with the SSSE3 encoder and process whole stages with 8-wide AVX loops.
 *
 * \param llrs      LLR buffer, laid out as consecutive stages of
 *                  frame_size floats.
 * \param u         decoded bits. Scratch is written at u + frame_size and
 *                  u + 2 * frame_size, so the caller must size u
 *                  accordingly — see the \p u allocation in the file's
 *                  usage example.
 * \param frame_exp log2 of the frame size.
 * \param stage     start stage.
 * \param u_num     index of the bit currently being decoded.
 * \param row       graph row to start decoding at.
 */
static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for ZERO row. == first bit to be decoded.
        // first do bit combination for all stages
        // effectively encode some decoded bits again.
        // NOTE(review): memcpy needs <string.h>; presumably included
        // transitively via the encodeframepolar header — verify.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        // 8 partial-sum bits and 16 source LLRs per iteration produce 8
        // output LLRs. fbits loads 16 bytes but only 8 are consumed per
        // step (u_target advances by 8); assumes the intrinsic uses the
        // low half — TODO confirm against volk_avx_intrinsics.h.
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    // NOTE(review): _mm256_zeroall zeroes all YMM registers (it does not
    // touch any cache); presumably here to avoid AVX/SSE state penalties
    // around the SSSE3 encoder call — verify intent.
    _mm256_zeroall();

    int el;
    // Vectorized min-sum updates, stage by stage, while >= 8 elements.
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3 vectors are too small!.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
| 288 | |||
| 289 | #endif /* LV_HAVE_AVX */ | ||
| 290 | |||
| 291 | #ifdef LV_HAVE_AVX2 | ||
| 292 | #include <immintrin.h> | ||
| 293 | #include <volk/volk_avx2_intrinsics.h> | ||
| 294 | |||
/*!
 * AVX2 (unaligned) polar-decoder butterfly.
 *
 * Identical structure to the AVX version: computes the LLR for bit u_num
 * at 'row' into llrs[row], falling back to the generic kernel for odd
 * rows and shallow recursions (max stage depth < 3). Differs only in
 * using the AVX2 encoder and AVX2 fsign/add intrinsic.
 *
 * \param llrs      LLR buffer, laid out as consecutive stages of
 *                  frame_size floats.
 * \param u         decoded bits. Scratch is written at u + frame_size and
 *                  u + 2 * frame_size, so the caller must size u
 *                  accordingly — see the \p u allocation in the file's
 *                  usage example.
 * \param frame_exp log2 of the frame size.
 * \param stage     start stage.
 * \param u_num     index of the bit currently being decoded.
 * \param row       graph row to start decoding at.
 */
static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for ZERO row. == first bit to be decoded.
        // first do bit combination for all stages
        // effectively encode some decoded bits again.
        // NOTE(review): memcpy needs <string.h>; presumably included
        // transitively via the encodeframepolar header — verify.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_u_avx2(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        // 8 partial-sum bits and 16 source LLRs per iteration produce 8
        // output LLRs. fbits loads 16 bytes but only 8 are consumed per
        // step (u_target advances by 8); assumes the intrinsic uses the
        // low half — TODO confirm against volk_avx2_intrinsics.h.
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    // NOTE(review): _mm256_zeroall zeroes all YMM registers (it does not
    // touch any cache); presumably here to avoid AVX/SSE state penalties —
    // verify intent.
    _mm256_zeroall();

    int el;
    // Vectorized min-sum updates, stage by stage, while >= 8 elements.
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3 vectors are too small!.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
| 383 | |||
| 384 | #endif /* LV_HAVE_AVX2 */ | ||
| 385 | |||
| 386 | #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */ | ||
| 387 |