| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2015 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /* | ||
| 11 | * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h' | ||
| 12 | */ | ||
| 13 | |||
| 14 | #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ | ||
| 15 | #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ | ||
| 16 | #include <string.h> | ||
| 17 | |||
| 18 | 2072 | static inline unsigned int log2_of_power_of_2(unsigned int val) | |
| 19 | { | ||
| 20 | // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog | ||
| 21 | static const unsigned int b[] = { | ||
| 22 | 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 | ||
| 23 | }; | ||
| 24 | |||
| 25 | 2072 | unsigned int res = (val & b[0]) != 0; | |
| 26 | 2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2056 times. | 2072 | res \|= ((val & b[4]) != 0) << 4; |
| 27 | 2/2 ✓ Branch 0 taken 72 times. ✓ Branch 1 taken 2000 times. | 2072 | res \|= ((val & b[3]) != 0) << 3; |
| 28 | 2/2 ✓ Branch 0 taken 972 times. ✓ Branch 1 taken 1100 times. | 2072 | res \|= ((val & b[2]) != 0) << 2; |
| 29 | 2/2 ✓ Branch 0 taken 1228 times. ✓ Branch 1 taken 844 times. | 2072 | res \|= ((val & b[1]) != 0) << 1; |
| 30 | 2072 | return res; | |
| 31 | } | ||
| 32 | |||
| 33 | 4200 | static inline void encodepolar_single_stage(unsigned char* frame_ptr, | |
| 34 | const unsigned char* temp_ptr, | ||
| 35 | const unsigned int num_branches, | ||
| 36 | const unsigned int frame_half) | ||
| 37 | { | ||
| 38 | unsigned int branch, bit; | ||
| 39 | 2/2 ✓ Branch 0 taken 166648 times. ✓ Branch 1 taken 4200 times. | 170848 | for (branch = 0; branch < num_branches; ++branch) { |
| 40 | 2/2 ✓ Branch 0 taken 1216512 times. ✓ Branch 1 taken 166648 times. | 1383160 | for (bit = 0; bit < frame_half; ++bit) { |
| 41 | 1216512 | *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); | |
| 42 | 1216512 | *(frame_ptr + frame_half) = *(temp_ptr + 1); | |
| 43 | 1216512 | ++frame_ptr; | |
| 44 | 1216512 | temp_ptr += 2; | |
| 45 | } | ||
| 46 | 166648 | frame_ptr += frame_half; | |
| 47 | } | ||
| 48 | 4200 | } | |
| 49 | |||
| 50 | #ifdef LV_HAVE_GENERIC | ||
| 51 | |||
| 52 | 1288 | static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, | |
| 53 | unsigned char* temp, | ||
| 54 | unsigned int frame_size) | ||
| 55 | { | ||
| 56 | 1288 | unsigned int stage = log2_of_power_of_2(frame_size); | |
| 57 | 1288 | unsigned int frame_half = frame_size >> 1; | |
| 58 | 1288 | unsigned int num_branches = 1; | |
| 59 | |||
| 60 | 2/2 ✓ Branch 0 taken 4200 times. ✓ Branch 1 taken 1288 times. | 5488 | while (stage) { |
| 61 | // encode stage | ||
| 62 | 4200 | encodepolar_single_stage(frame, temp, num_branches, frame_half); | |
| 63 | 4200 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
| 64 | |||
| 65 | // update all the parameters. | ||
| 66 | 4200 | num_branches = num_branches << 1; | |
| 67 | 4200 | frame_half = frame_half >> 1; | |
| 68 | 4200 | --stage; | |
| 69 | } | ||
| 70 | 1288 | } | |
| 71 | #endif /* LV_HAVE_GENERIC */ | ||
| 72 | |||
| 73 | #ifdef LV_HAVE_SSSE3 | ||
| 74 | #include <tmmintrin.h> | ||
| 75 | |||
| 76 | 1024 | static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, | |
| 77 | unsigned char* temp, | ||
| 78 | unsigned int frame_size) | ||
| 79 | { | ||
| 80 | 2/2 ✓ Branch 0 taken 512 times. ✓ Branch 1 taken 512 times. | 1024 | if (frame_size < 16) { |
| 81 | 512 | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
| 82 | 512 | return; | |
| 83 | } | ||
| 84 | |||
| 85 | 512 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
| 86 | |||
| 87 | 512 | unsigned int stage = po2; | |
| 88 | 512 | unsigned char* frame_ptr = frame; | |
| 89 | 512 | unsigned char* temp_ptr = temp; | |
| 90 | |||
| 91 | 512 | unsigned int frame_half = frame_size >> 1; | |
| 92 | 512 | unsigned int num_branches = 1; | |
| 93 | unsigned int branch; | ||
| 94 | unsigned int bit; | ||
| 95 | |||
| 96 | // prepare constants | ||
| 97 | 512 | const __m128i mask_stage1 = _mm_set_epi8(0x0, | |
| 98 | 0xFF, | ||
| 99 | 0x0, | ||
| 100 | 0xFF, | ||
| 101 | 0x0, | ||
| 102 | 0xFF, | ||
| 103 | 0x0, | ||
| 104 | 0xFF, | ||
| 105 | 0x0, | ||
| 106 | 0xFF, | ||
| 107 | 0x0, | ||
| 108 | 0xFF, | ||
| 109 | 0x0, | ||
| 110 | 0xFF, | ||
| 111 | 0x0, | ||
| 112 | 0xFF); | ||
| 113 | |||
| 114 | // get some SIMD registers to play with. | ||
| 115 | __m128i r_frame0, r_temp0, shifted; | ||
| 116 | |||
| 117 | { | ||
| 118 | __m128i r_frame1, r_temp1; | ||
| 119 | const __m128i shuffle_separate = | ||
| 120 | 512 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
| 121 | |||
| 122 | 2/2 ✓ Branch 0 taken 518 times. ✓ Branch 1 taken 512 times. | 1030 | while (stage > 4) { |
| 123 | 518 | frame_ptr = frame; | |
| 124 | 518 | temp_ptr = temp; | |
| 125 | |||
| 126 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
| 127 | 2/2 ✓ Branch 0 taken 9728 times. ✓ Branch 1 taken 518 times. | 10246 | for (branch = 0; branch < num_branches; ++branch) { |
| 128 | 2/2 ✓ Branch 0 taken 52736 times. ✓ Branch 1 taken 9728 times. | 62464 | for (bit = 0; bit < frame_half; bit += 16) { |
| 129 | 52736 | r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); | |
| 130 | 52736 | temp_ptr += 16; | |
| 131 | 52736 | r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr); | |
| 132 | 52736 | temp_ptr += 16; | |
| 133 | |||
| 134 | 52736 | shifted = _mm_srli_si128(r_temp0, 1); | |
| 135 | 52736 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 136 | 52736 | r_temp0 = _mm_xor_si128(shifted, r_temp0); | |
| 137 | 52736 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); | |
| 138 | |||
| 139 | 52736 | shifted = _mm_srli_si128(r_temp1, 1); | |
| 140 | 52736 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 141 | 52736 | r_temp1 = _mm_xor_si128(shifted, r_temp1); | |
| 142 | 52736 | r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); | |
| 143 | |||
| 144 | 52736 | r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); | |
| 145 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); | ||
| 146 | |||
| 147 | 52736 | r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); | |
| 148 | 52736 | _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1); | |
| 149 | 52736 | frame_ptr += 16; | |
| 150 | } | ||
| 151 | |||
| 152 | 9728 | frame_ptr += frame_half; | |
| 153 | } | ||
| 154 | 518 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
| 155 | |||
| 156 | 518 | num_branches = num_branches << 1; | |
| 157 | 518 | frame_half = frame_half >> 1; | |
| 158 | 518 | stage--; | |
| 159 | } | ||
| 160 | } | ||
| 161 | |||
| 162 | // This last part requires at least 16-bit frames. | ||
| 163 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
| 164 | |||
| 165 | // reset pointers to correct positions. | ||
| 166 | 512 | frame_ptr = frame; | |
| 167 | 512 | temp_ptr = temp; | |
| 168 | |||
| 169 | // prefetch first chunk | ||
| 170 | 512 | __VOLK_PREFETCH(temp_ptr); | |
| 171 | |||
| 172 | const __m128i shuffle_stage4 = | ||
| 173 | 512 | _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); | |
| 174 | 512 | const __m128i mask_stage4 = _mm_set_epi8(0x0, | |
| 175 | 0x0, | ||
| 176 | 0x0, | ||
| 177 | 0x0, | ||
| 178 | 0x0, | ||
| 179 | 0x0, | ||
| 180 | 0x0, | ||
| 181 | 0x0, | ||
| 182 | 0xFF, | ||
| 183 | 0xFF, | ||
| 184 | 0xFF, | ||
| 185 | 0xFF, | ||
| 186 | 0xFF, | ||
| 187 | 0xFF, | ||
| 188 | 0xFF, | ||
| 189 | 0xFF); | ||
| 190 | 512 | const __m128i mask_stage3 = _mm_set_epi8(0x0, | |
| 191 | 0x0, | ||
| 192 | 0x0, | ||
| 193 | 0x0, | ||
| 194 | 0xFF, | ||
| 195 | 0xFF, | ||
| 196 | 0xFF, | ||
| 197 | 0xFF, | ||
| 198 | 0x0, | ||
| 199 | 0x0, | ||
| 200 | 0x0, | ||
| 201 | 0x0, | ||
| 202 | 0xFF, | ||
| 203 | 0xFF, | ||
| 204 | 0xFF, | ||
| 205 | 0xFF); | ||
| 206 | 512 | const __m128i mask_stage2 = _mm_set_epi8(0x0, | |
| 207 | 0x0, | ||
| 208 | 0xFF, | ||
| 209 | 0xFF, | ||
| 210 | 0x0, | ||
| 211 | 0x0, | ||
| 212 | 0xFF, | ||
| 213 | 0xFF, | ||
| 214 | 0x0, | ||
| 215 | 0x0, | ||
| 216 | 0xFF, | ||
| 217 | 0xFF, | ||
| 218 | 0x0, | ||
| 219 | 0x0, | ||
| 220 | 0xFF, | ||
| 221 | 0xFF); | ||
| 222 | |||
| 223 | 2/2 ✓ Branch 0 taken 10240 times. ✓ Branch 1 taken 512 times. | 10752 | for (branch = 0; branch < num_branches; ++branch) { |
| 224 | 10240 | r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); | |
| 225 | |||
| 226 | // prefetch next chunk | ||
| 227 | 10240 | temp_ptr += 16; | |
| 228 | 10240 | __VOLK_PREFETCH(temp_ptr); | |
| 229 | |||
| 230 | // shuffle once for bit-reversal. | ||
| 231 | 10240 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); | |
| 232 | |||
| 233 | 10240 | shifted = _mm_srli_si128(r_temp0, 8); | |
| 234 | 10240 | shifted = _mm_and_si128(shifted, mask_stage4); | |
| 235 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_temp0); | |
| 236 | |||
| 237 | 10240 | shifted = _mm_srli_si128(r_frame0, 4); | |
| 238 | 10240 | shifted = _mm_and_si128(shifted, mask_stage3); | |
| 239 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 240 | |||
| 241 | 10240 | shifted = _mm_srli_si128(r_frame0, 2); | |
| 242 | 10240 | shifted = _mm_and_si128(shifted, mask_stage2); | |
| 243 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 244 | |||
| 245 | 10240 | shifted = _mm_srli_si128(r_frame0, 1); | |
| 246 | 10240 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 247 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 248 | |||
| 249 | // store result of chunk. | ||
| 250 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); | ||
| 251 | 10240 | frame_ptr += 16; | |
| 252 | } | ||
| 253 | } | ||
| 254 | |||
| 255 | #endif /* LV_HAVE_SSSE3 */ | ||
| 256 | |||
| 257 | #ifdef LV_HAVE_AVX2 | ||
| 258 | #include <immintrin.h> | ||
| 259 | |||
| 260 | 1024 | static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, | |
| 261 | unsigned char* temp, | ||
| 262 | unsigned int frame_size) | ||
| 263 | { | ||
| 264 | 2/2 ✓ Branch 0 taken 768 times. ✓ Branch 1 taken 256 times. | 1024 | if (frame_size < 32) { |
| 265 | 768 | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
| 266 | 768 | return; | |
| 267 | } | ||
| 268 | |||
| 269 | 256 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
| 270 | |||
| 271 | 256 | unsigned int stage = po2; | |
| 272 | 256 | unsigned char* frame_ptr = frame; | |
| 273 | 256 | unsigned char* temp_ptr = temp; | |
| 274 | |||
| 275 | 256 | unsigned int frame_half = frame_size >> 1; | |
| 276 | 256 | unsigned int num_branches = 1; | |
| 277 | unsigned int branch; | ||
| 278 | unsigned int bit; | ||
| 279 | |||
| 280 | // prepare constants | ||
| 281 | 256 | const __m256i mask_stage1 = _mm256_set_epi8(0x0, | |
| 282 | 0xFF, | ||
| 283 | 0x0, | ||
| 284 | 0xFF, | ||
| 285 | 0x0, | ||
| 286 | 0xFF, | ||
| 287 | 0x0, | ||
| 288 | 0xFF, | ||
| 289 | 0x0, | ||
| 290 | 0xFF, | ||
| 291 | 0x0, | ||
| 292 | 0xFF, | ||
| 293 | 0x0, | ||
| 294 | 0xFF, | ||
| 295 | 0x0, | ||
| 296 | 0xFF, | ||
| 297 | 0x0, | ||
| 298 | 0xFF, | ||
| 299 | 0x0, | ||
| 300 | 0xFF, | ||
| 301 | 0x0, | ||
| 302 | 0xFF, | ||
| 303 | 0x0, | ||
| 304 | 0xFF, | ||
| 305 | 0x0, | ||
| 306 | 0xFF, | ||
| 307 | 0x0, | ||
| 308 | 0xFF, | ||
| 309 | 0x0, | ||
| 310 | 0xFF, | ||
| 311 | 0x0, | ||
| 312 | 0xFF); | ||
| 313 | |||
| 314 | 256 | const __m128i mask_stage0 = _mm_set_epi8(0x0, | |
| 315 | 0xFF, | ||
| 316 | 0x0, | ||
| 317 | 0xFF, | ||
| 318 | 0x0, | ||
| 319 | 0xFF, | ||
| 320 | 0x0, | ||
| 321 | 0xFF, | ||
| 322 | 0x0, | ||
| 323 | 0xFF, | ||
| 324 | 0x0, | ||
| 325 | 0xFF, | ||
| 326 | 0x0, | ||
| 327 | 0xFF, | ||
| 328 | 0x0, | ||
| 329 | 0xFF); | ||
| 330 | // get some SIMD registers to play with. | ||
| 331 | __m256i r_frame0, r_temp0, shifted; | ||
| 332 | __m128i r_temp2, r_frame2, shifted2; | ||
| 333 | { | ||
| 334 | __m256i r_frame1, r_temp1; | ||
| 335 | __m128i r_frame3, r_temp3; | ||
| 336 | 256 | const __m256i shuffle_separate = _mm256_setr_epi8(0, | |
| 337 | 2, | ||
| 338 | 4, | ||
| 339 | 6, | ||
| 340 | 8, | ||
| 341 | 10, | ||
| 342 | 12, | ||
| 343 | 14, | ||
| 344 | 1, | ||
| 345 | 3, | ||
| 346 | 5, | ||
| 347 | 7, | ||
| 348 | 9, | ||
| 349 | 11, | ||
| 350 | 13, | ||
| 351 | 15, | ||
| 352 | 0, | ||
| 353 | 2, | ||
| 354 | 4, | ||
| 355 | 6, | ||
| 356 | 8, | ||
| 357 | 10, | ||
| 358 | 12, | ||
| 359 | 14, | ||
| 360 | 1, | ||
| 361 | 3, | ||
| 362 | 5, | ||
| 363 | 7, | ||
| 364 | 9, | ||
| 365 | 11, | ||
| 366 | 13, | ||
| 367 | 15); | ||
| 368 | const __m128i shuffle_separate128 = | ||
| 369 | 256 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
| 370 | |||
| 371 | 2/2 ✓ Branch 0 taken 518 times. ✓ Branch 1 taken 256 times. | 774 | while (stage > 4) { |
| 372 | 518 | frame_ptr = frame; | |
| 373 | 518 | temp_ptr = temp; | |
| 374 | |||
| 375 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
| 376 | 2/2 ✓ Branch 0 taken 9728 times. ✓ Branch 1 taken 518 times. | 10246 | for (branch = 0; branch < num_branches; ++branch) { |
| 377 | 2/2 ✓ Branch 0 taken 28864 times. ✓ Branch 1 taken 4736 times. | 33600 | for (bit = 0; bit < frame_half; bit += 32) { |
| 378 | 2/2 ✓ Branch 0 taken 4992 times. ✓ Branch 1 taken 23872 times. | 28864 | if ((frame_half - bit) < |
| 379 | 32) // if only 16 bytes remain in this frame half, not 32 | ||
| 380 | { | ||
| 381 | 4992 | r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr); | |
| 382 | 4992 | temp_ptr += 16; | |
| 383 | 4992 | r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr); | |
| 384 | 4992 | temp_ptr += 16; | |
| 385 | |||
| 386 | 4992 | shifted2 = _mm_srli_si128(r_temp2, 1); | |
| 387 | 4992 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
| 388 | 4992 | r_temp2 = _mm_xor_si128(shifted2, r_temp2); | |
| 389 | 4992 | r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); | |
| 390 | |||
| 391 | 4992 | shifted2 = _mm_srli_si128(r_temp3, 1); | |
| 392 | 4992 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
| 393 | 4992 | r_temp3 = _mm_xor_si128(shifted2, r_temp3); | |
| 394 | 4992 | r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); | |
| 395 | |||
| 396 | 4992 | r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); | |
| 397 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame2); | ||
| 398 | |||
| 399 | 4992 | r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); | |
| 400 | 4992 | _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3); | |
| 401 | 4992 | frame_ptr += 16; | |
| 402 | 4992 | break; | |
| 403 | } | ||
| 404 | 23872 | r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
| 405 | 23872 | temp_ptr += 32; | |
| 406 | 23872 | r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
| 407 | 23872 | temp_ptr += 32; | |
| 408 | |||
| 409 | 23872 | shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes | |
| 410 | 23872 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 411 | 23872 | r_temp0 = _mm256_xor_si256(shifted, r_temp0); | |
| 412 | 23872 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); | |
| 413 | |||
| 414 | 23872 | shifted = _mm256_srli_si256(r_temp1, 1); | |
| 415 | 23872 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 416 | 23872 | r_temp1 = _mm256_xor_si256(shifted, r_temp1); | |
| 417 | 23872 | r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); | |
| 418 | |||
| 419 | 23872 | r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); | |
| 420 | 23872 | r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); | |
| 421 | 23872 | r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); | |
| 422 | 23872 | r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); | |
| 423 | |||
| 424 | _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); | ||
| 425 | |||
| 426 | 23872 | _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1); | |
| 427 | 23872 | frame_ptr += 32; | |
| 428 | } | ||
| 429 | |||
| 430 | 9728 | frame_ptr += frame_half; | |
| 431 | } | ||
| 432 | 518 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
| 433 | |||
| 434 | 518 | num_branches = num_branches << 1; | |
| 435 | 518 | frame_half = frame_half >> 1; | |
| 436 | 518 | stage--; | |
| 437 | } | ||
| 438 | } | ||
| 439 | |||
| 440 | // This last part requires at least 32-bit frames. | ||
| 441 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
| 442 | |||
| 443 | // reset pointers to correct positions. | ||
| 444 | 256 | frame_ptr = frame; | |
| 445 | 256 | temp_ptr = temp; | |
| 446 | |||
| 447 | // prefetch first chunk | ||
| 448 | 256 | __VOLK_PREFETCH(temp_ptr); | |
| 449 | |||
| 450 | 256 | const __m256i shuffle_stage4 = _mm256_setr_epi8(0, | |
| 451 | 8, | ||
| 452 | 4, | ||
| 453 | 12, | ||
| 454 | 2, | ||
| 455 | 10, | ||
| 456 | 6, | ||
| 457 | 14, | ||
| 458 | 1, | ||
| 459 | 9, | ||
| 460 | 5, | ||
| 461 | 13, | ||
| 462 | 3, | ||
| 463 | 11, | ||
| 464 | 7, | ||
| 465 | 15, | ||
| 466 | 0, | ||
| 467 | 8, | ||
| 468 | 4, | ||
| 469 | 12, | ||
| 470 | 2, | ||
| 471 | 10, | ||
| 472 | 6, | ||
| 473 | 14, | ||
| 474 | 1, | ||
| 475 | 9, | ||
| 476 | 5, | ||
| 477 | 13, | ||
| 478 | 3, | ||
| 479 | 11, | ||
| 480 | 7, | ||
| 481 | 15); | ||
| 482 | 256 | const __m256i mask_stage4 = _mm256_set_epi8(0x0, | |
| 483 | 0x0, | ||
| 484 | 0x0, | ||
| 485 | 0x0, | ||
| 486 | 0x0, | ||
| 487 | 0x0, | ||
| 488 | 0x0, | ||
| 489 | 0x0, | ||
| 490 | 0xFF, | ||
| 491 | 0xFF, | ||
| 492 | 0xFF, | ||
| 493 | 0xFF, | ||
| 494 | 0xFF, | ||
| 495 | 0xFF, | ||
| 496 | 0xFF, | ||
| 497 | 0xFF, | ||
| 498 | 0x0, | ||
| 499 | 0x0, | ||
| 500 | 0x0, | ||
| 501 | 0x0, | ||
| 502 | 0x0, | ||
| 503 | 0x0, | ||
| 504 | 0x0, | ||
| 505 | 0x0, | ||
| 506 | 0xFF, | ||
| 507 | 0xFF, | ||
| 508 | 0xFF, | ||
| 509 | 0xFF, | ||
| 510 | 0xFF, | ||
| 511 | 0xFF, | ||
| 512 | 0xFF, | ||
| 513 | 0xFF); | ||
| 514 | 256 | const __m256i mask_stage3 = _mm256_set_epi8(0x0, | |
| 515 | 0x0, | ||
| 516 | 0x0, | ||
| 517 | 0x0, | ||
| 518 | 0xFF, | ||
| 519 | 0xFF, | ||
| 520 | 0xFF, | ||
| 521 | 0xFF, | ||
| 522 | 0x0, | ||
| 523 | 0x0, | ||
| 524 | 0x0, | ||
| 525 | 0x0, | ||
| 526 | 0xFF, | ||
| 527 | 0xFF, | ||
| 528 | 0xFF, | ||
| 529 | 0xFF, | ||
| 530 | 0x0, | ||
| 531 | 0x0, | ||
| 532 | 0x0, | ||
| 533 | 0x0, | ||
| 534 | 0xFF, | ||
| 535 | 0xFF, | ||
| 536 | 0xFF, | ||
| 537 | 0xFF, | ||
| 538 | 0x0, | ||
| 539 | 0x0, | ||
| 540 | 0x0, | ||
| 541 | 0x0, | ||
| 542 | 0xFF, | ||
| 543 | 0xFF, | ||
| 544 | 0xFF, | ||
| 545 | 0xFF); | ||
| 546 | 256 | const __m256i mask_stage2 = _mm256_set_epi8(0x0, | |
| 547 | 0x0, | ||
| 548 | 0xFF, | ||
| 549 | 0xFF, | ||
| 550 | 0x0, | ||
| 551 | 0x0, | ||
| 552 | 0xFF, | ||
| 553 | 0xFF, | ||
| 554 | 0x0, | ||
| 555 | 0x0, | ||
| 556 | 0xFF, | ||
| 557 | 0xFF, | ||
| 558 | 0x0, | ||
| 559 | 0x0, | ||
| 560 | 0xFF, | ||
| 561 | 0xFF, | ||
| 562 | 0x0, | ||
| 563 | 0x0, | ||
| 564 | 0xFF, | ||
| 565 | 0xFF, | ||
| 566 | 0x0, | ||
| 567 | 0x0, | ||
| 568 | 0xFF, | ||
| 569 | 0xFF, | ||
| 570 | 0x0, | ||
| 571 | 0x0, | ||
| 572 | 0xFF, | ||
| 573 | 0xFF, | ||
| 574 | 0x0, | ||
| 575 | 0x0, | ||
| 576 | 0xFF, | ||
| 577 | 0xFF); | ||
| 578 | |||
| 579 | 2/2 ✓ Branch 0 taken 4992 times. ✓ Branch 1 taken 256 times. | 5248 | for (branch = 0; branch < num_branches / 2; ++branch) { |
| 580 | 4992 | r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
| 581 | |||
| 582 | // prefetch next chunk | ||
| 583 | 4992 | temp_ptr += 32; | |
| 584 | 4992 | __VOLK_PREFETCH(temp_ptr); | |
| 585 | |||
| 586 | // shuffle once for bit-reversal. | ||
| 587 | 4992 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); | |
| 588 | |||
| 589 | 4992 | shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes | |
| 590 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage4); | |
| 591 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_temp0); | |
| 592 | |||
| 593 | |||
| 594 | 4992 | shifted = _mm256_srli_si256(r_frame0, 4); | |
| 595 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage3); | |
| 596 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 597 | |||
| 598 | 4992 | shifted = _mm256_srli_si256(r_frame0, 2); | |
| 599 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage2); | |
| 600 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 601 | |||
| 602 | 4992 | shifted = _mm256_srli_si256(r_frame0, 1); | |
| 603 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 604 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 605 | |||
| 606 | // store result of chunk. | ||
| 607 | _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); | ||
| 608 | 4992 | frame_ptr += 32; | |
| 609 | } | ||
| 610 | } | ||
| 611 | #endif /* LV_HAVE_AVX2 */ | ||
| 612 | |||
| 613 | #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */ | ||
| 614 | |||
| 615 | #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ | ||
| 616 | #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ | ||
| 617 | |||
| 618 | #ifdef LV_HAVE_SSSE3 | ||
| 619 | #include <tmmintrin.h> | ||
| 620 | |||
| 621 | 2 | static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, | |
| 622 | unsigned char* temp, | ||
| 623 | unsigned int frame_size) | ||
| 624 | { | ||
| 625 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times. | 2 | if (frame_size < 16) { |
| 626 | ✗ | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
| 627 | ✗ | return; | |
| 628 | } | ||
| 629 | |||
| 630 | 2 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
| 631 | |||
| 632 | 2 | unsigned int stage = po2; | |
| 633 | 2 | unsigned char* frame_ptr = frame; | |
| 634 | 2 | unsigned char* temp_ptr = temp; | |
| 635 | |||
| 636 | 2 | unsigned int frame_half = frame_size >> 1; | |
| 637 | 2 | unsigned int num_branches = 1; | |
| 638 | unsigned int branch; | ||
| 639 | unsigned int bit; | ||
| 640 | |||
| 641 | // prepare constants | ||
| 642 | 2 | const __m128i mask_stage1 = _mm_set_epi8(0x0, | |
| 643 | 0xFF, | ||
| 644 | 0x0, | ||
| 645 | 0xFF, | ||
| 646 | 0x0, | ||
| 647 | 0xFF, | ||
| 648 | 0x0, | ||
| 649 | 0xFF, | ||
| 650 | 0x0, | ||
| 651 | 0xFF, | ||
| 652 | 0x0, | ||
| 653 | 0xFF, | ||
| 654 | 0x0, | ||
| 655 | 0xFF, | ||
| 656 | 0x0, | ||
| 657 | 0xFF); | ||
| 658 | |||
| 659 | // get some SIMD registers to play with. | ||
| 660 | __m128i r_frame0, r_temp0, shifted; | ||
| 661 | |||
| 662 | { | ||
| 663 | __m128i r_frame1, r_temp1; | ||
| 664 | const __m128i shuffle_separate = | ||
| 665 | 2 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
| 666 | |||
| 667 | 2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 2 times. | 26 | while (stage > 4) { |
| 668 | 24 | frame_ptr = frame; | |
| 669 | 24 | temp_ptr = temp; | |
| 670 | |||
| 671 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
| 672 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 24 times. | 8214 | for (branch = 0; branch < num_branches; ++branch) { |
| 673 | 2/2 ✓ Branch 0 taken 49152 times. ✓ Branch 1 taken 8190 times. | 57342 | for (bit = 0; bit < frame_half; bit += 16) { |
| 674 | 49152 | r_temp0 = _mm_load_si128((__m128i*)temp_ptr); | |
| 675 | 49152 | temp_ptr += 16; | |
| 676 | 49152 | r_temp1 = _mm_load_si128((__m128i*)temp_ptr); | |
| 677 | 49152 | temp_ptr += 16; | |
| 678 | |||
| 679 | 49152 | shifted = _mm_srli_si128(r_temp0, 1); | |
| 680 | 49152 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 681 | 49152 | r_temp0 = _mm_xor_si128(shifted, r_temp0); | |
| 682 | 49152 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); | |
| 683 | |||
| 684 | 49152 | shifted = _mm_srli_si128(r_temp1, 1); | |
| 685 | 49152 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 686 | 49152 | r_temp1 = _mm_xor_si128(shifted, r_temp1); | |
| 687 | 49152 | r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); | |
| 688 | |||
| 689 | 49152 | r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); | |
| 690 | _mm_store_si128((__m128i*)frame_ptr, r_frame0); | ||
| 691 | |||
| 692 | 49152 | r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); | |
| 693 | 49152 | _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1); | |
| 694 | 49152 | frame_ptr += 16; | |
| 695 | } | ||
| 696 | |||
| 697 | 8190 | frame_ptr += frame_half; | |
| 698 | } | ||
| 699 | 24 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
| 700 | |||
| 701 | 24 | num_branches = num_branches << 1; | |
| 702 | 24 | frame_half = frame_half >> 1; | |
| 703 | 24 | stage--; | |
| 704 | } | ||
| 705 | } | ||
| 706 | |||
| 707 | // This last part requires at least 16-bit frames. | ||
| 708 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
| 709 | |||
| 710 | // reset pointers to correct positions. | ||
| 711 | 2 | frame_ptr = frame; | |
| 712 | 2 | temp_ptr = temp; | |
| 713 | |||
| 714 | // prefetch first chunk | ||
| 715 | 2 | __VOLK_PREFETCH(temp_ptr); | |
| 716 | |||
| 717 | const __m128i shuffle_stage4 = | ||
| 718 | 2 | _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); | |
| 719 | 2 | const __m128i mask_stage4 = _mm_set_epi8(0x0, | |
| 720 | 0x0, | ||
| 721 | 0x0, | ||
| 722 | 0x0, | ||
| 723 | 0x0, | ||
| 724 | 0x0, | ||
| 725 | 0x0, | ||
| 726 | 0x0, | ||
| 727 | 0xFF, | ||
| 728 | 0xFF, | ||
| 729 | 0xFF, | ||
| 730 | 0xFF, | ||
| 731 | 0xFF, | ||
| 732 | 0xFF, | ||
| 733 | 0xFF, | ||
| 734 | 0xFF); | ||
| 735 | 2 | const __m128i mask_stage3 = _mm_set_epi8(0x0, | |
| 736 | 0x0, | ||
| 737 | 0x0, | ||
| 738 | 0x0, | ||
| 739 | 0xFF, | ||
| 740 | 0xFF, | ||
| 741 | 0xFF, | ||
| 742 | 0xFF, | ||
| 743 | 0x0, | ||
| 744 | 0x0, | ||
| 745 | 0x0, | ||
| 746 | 0x0, | ||
| 747 | 0xFF, | ||
| 748 | 0xFF, | ||
| 749 | 0xFF, | ||
| 750 | 0xFF); | ||
| 751 | 2 | const __m128i mask_stage2 = _mm_set_epi8(0x0, | |
| 752 | 0x0, | ||
| 753 | 0xFF, | ||
| 754 | 0xFF, | ||
| 755 | 0x0, | ||
| 756 | 0x0, | ||
| 757 | 0xFF, | ||
| 758 | 0xFF, | ||
| 759 | 0x0, | ||
| 760 | 0x0, | ||
| 761 | 0xFF, | ||
| 762 | 0xFF, | ||
| 763 | 0x0, | ||
| 764 | 0x0, | ||
| 765 | 0xFF, | ||
| 766 | 0xFF); | ||
| 767 | |||
| 768 | 2/2 ✓ Branch 0 taken 8192 times. ✓ Branch 1 taken 2 times. | 8194 | for (branch = 0; branch < num_branches; ++branch) { |
| 769 | 8192 | r_temp0 = _mm_load_si128((__m128i*)temp_ptr); | |
| 770 | |||
| 771 | // prefetch next chunk | ||
| 772 | 8192 | temp_ptr += 16; | |
| 773 | 8192 | __VOLK_PREFETCH(temp_ptr); | |
| 774 | |||
| 775 | // shuffle once for bit-reversal. | ||
| 776 | 8192 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); | |
| 777 | |||
| 778 | 8192 | shifted = _mm_srli_si128(r_temp0, 8); | |
| 779 | 8192 | shifted = _mm_and_si128(shifted, mask_stage4); | |
| 780 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_temp0); | |
| 781 | |||
| 782 | 8192 | shifted = _mm_srli_si128(r_frame0, 4); | |
| 783 | 8192 | shifted = _mm_and_si128(shifted, mask_stage3); | |
| 784 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 785 | |||
| 786 | 8192 | shifted = _mm_srli_si128(r_frame0, 2); | |
| 787 | 8192 | shifted = _mm_and_si128(shifted, mask_stage2); | |
| 788 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 789 | |||
| 790 | 8192 | shifted = _mm_srli_si128(r_frame0, 1); | |
| 791 | 8192 | shifted = _mm_and_si128(shifted, mask_stage1); | |
| 792 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
| 793 | |||
| 794 | // store result of chunk. | ||
| 795 | _mm_store_si128((__m128i*)frame_ptr, r_frame0); | ||
| 796 | 8192 | frame_ptr += 16; | |
| 797 | } | ||
| 798 | } | ||
| 799 | #endif /* LV_HAVE_SSSE3 */ | ||
| 800 | |||
| 801 | #ifdef LV_HAVE_AVX2 | ||
| 802 | #include <immintrin.h> | ||
| 803 | |||
| 804 | 2 | static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, | |
| 805 | unsigned char* temp, | ||
| 806 | unsigned int frame_size) | ||
| 807 | { | ||
| 808 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times. | 2 | if (frame_size < 32) { |
| 809 | ✗ | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
| 810 | ✗ | return; | |
| 811 | } | ||
| 812 | |||
| 813 | 2 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
| 814 | |||
| 815 | 2 | unsigned int stage = po2; | |
| 816 | 2 | unsigned char* frame_ptr = frame; | |
| 817 | 2 | unsigned char* temp_ptr = temp; | |
| 818 | |||
| 819 | 2 | unsigned int frame_half = frame_size >> 1; | |
| 820 | 2 | unsigned int num_branches = 1; | |
| 821 | unsigned int branch; | ||
| 822 | unsigned int bit; | ||
| 823 | |||
| 824 | // prepare constants | ||
| 825 | 2 | const __m256i mask_stage1 = _mm256_set_epi8(0x0, | |
| 826 | 0xFF, | ||
| 827 | 0x0, | ||
| 828 | 0xFF, | ||
| 829 | 0x0, | ||
| 830 | 0xFF, | ||
| 831 | 0x0, | ||
| 832 | 0xFF, | ||
| 833 | 0x0, | ||
| 834 | 0xFF, | ||
| 835 | 0x0, | ||
| 836 | 0xFF, | ||
| 837 | 0x0, | ||
| 838 | 0xFF, | ||
| 839 | 0x0, | ||
| 840 | 0xFF, | ||
| 841 | 0x0, | ||
| 842 | 0xFF, | ||
| 843 | 0x0, | ||
| 844 | 0xFF, | ||
| 845 | 0x0, | ||
| 846 | 0xFF, | ||
| 847 | 0x0, | ||
| 848 | 0xFF, | ||
| 849 | 0x0, | ||
| 850 | 0xFF, | ||
| 851 | 0x0, | ||
| 852 | 0xFF, | ||
| 853 | 0x0, | ||
| 854 | 0xFF, | ||
| 855 | 0x0, | ||
| 856 | 0xFF); | ||
| 857 | |||
| 858 | 2 | const __m128i mask_stage0 = _mm_set_epi8(0x0, | |
| 859 | 0xFF, | ||
| 860 | 0x0, | ||
| 861 | 0xFF, | ||
| 862 | 0x0, | ||
| 863 | 0xFF, | ||
| 864 | 0x0, | ||
| 865 | 0xFF, | ||
| 866 | 0x0, | ||
| 867 | 0xFF, | ||
| 868 | 0x0, | ||
| 869 | 0xFF, | ||
| 870 | 0x0, | ||
| 871 | 0xFF, | ||
| 872 | 0x0, | ||
| 873 | 0xFF); | ||
| 874 | // get some SIMD registers to play with. | ||
| 875 | __m256i r_frame0, r_temp0, shifted; | ||
| 876 | __m128i r_temp2, r_frame2, shifted2; | ||
| 877 | { | ||
| 878 | __m256i r_frame1, r_temp1; | ||
| 879 | __m128i r_frame3, r_temp3; | ||
| 880 | 2 | const __m256i shuffle_separate = _mm256_setr_epi8(0, | |
| 881 | 2, | ||
| 882 | 4, | ||
| 883 | 6, | ||
| 884 | 8, | ||
| 885 | 10, | ||
| 886 | 12, | ||
| 887 | 14, | ||
| 888 | 1, | ||
| 889 | 3, | ||
| 890 | 5, | ||
| 891 | 7, | ||
| 892 | 9, | ||
| 893 | 11, | ||
| 894 | 13, | ||
| 895 | 15, | ||
| 896 | 0, | ||
| 897 | 2, | ||
| 898 | 4, | ||
| 899 | 6, | ||
| 900 | 8, | ||
| 901 | 10, | ||
| 902 | 12, | ||
| 903 | 14, | ||
| 904 | 1, | ||
| 905 | 3, | ||
| 906 | 5, | ||
| 907 | 7, | ||
| 908 | 9, | ||
| 909 | 11, | ||
| 910 | 13, | ||
| 911 | 15); | ||
| 912 | const __m128i shuffle_separate128 = | ||
| 913 | 2 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
| 914 | |||
| 915 | 2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 2 times. | 26 | while (stage > 4) { |
| 916 | 24 | frame_ptr = frame; | |
| 917 | 24 | temp_ptr = temp; | |
| 918 | |||
| 919 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
| 920 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 24 times. | 8214 | for (branch = 0; branch < num_branches; ++branch) { |
| 921 | 2/2 ✓ Branch 0 taken 26624 times. ✓ Branch 1 taken 4094 times. | 30718 | for (bit = 0; bit < frame_half; bit += 32) { |
| 922 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 22528 times. | 26624 | if ((frame_half - bit) < |
| 923 | 32) // if only 16 bytes remain in this frame half, not 32 | ||
| 924 | { | ||
| 925 | 4096 | r_temp2 = _mm_load_si128((__m128i*)temp_ptr); | |
| 926 | 4096 | temp_ptr += 16; | |
| 927 | 4096 | r_temp3 = _mm_load_si128((__m128i*)temp_ptr); | |
| 928 | 4096 | temp_ptr += 16; | |
| 929 | |||
| 930 | 4096 | shifted2 = _mm_srli_si128(r_temp2, 1); | |
| 931 | 4096 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
| 932 | 4096 | r_temp2 = _mm_xor_si128(shifted2, r_temp2); | |
| 933 | 4096 | r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); | |
| 934 | |||
| 935 | 4096 | shifted2 = _mm_srli_si128(r_temp3, 1); | |
| 936 | 4096 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
| 937 | 4096 | r_temp3 = _mm_xor_si128(shifted2, r_temp3); | |
| 938 | 4096 | r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); | |
| 939 | |||
| 940 | 4096 | r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); | |
| 941 | _mm_store_si128((__m128i*)frame_ptr, r_frame2); | ||
| 942 | |||
| 943 | 4096 | r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); | |
| 944 | 4096 | _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3); | |
| 945 | 4096 | frame_ptr += 16; | |
| 946 | 4096 | break; | |
| 947 | } | ||
| 948 | 22528 | r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); | |
| 949 | 22528 | temp_ptr += 32; | |
| 950 | 22528 | r_temp1 = _mm256_load_si256((__m256i*)temp_ptr); | |
| 951 | 22528 | temp_ptr += 32; | |
| 952 | |||
| 953 | 22528 | shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes | |
| 954 | 22528 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 955 | 22528 | r_temp0 = _mm256_xor_si256(shifted, r_temp0); | |
| 956 | 22528 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); | |
| 957 | |||
| 958 | 22528 | shifted = _mm256_srli_si256(r_temp1, 1); | |
| 959 | 22528 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 960 | 22528 | r_temp1 = _mm256_xor_si256(shifted, r_temp1); | |
| 961 | 22528 | r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); | |
| 962 | |||
| 963 | 22528 | r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); | |
| 964 | 22528 | r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); | |
| 965 | 22528 | r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); | |
| 966 | 22528 | r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); | |
| 967 | |||
| 968 | _mm256_store_si256((__m256i*)frame_ptr, r_frame0); | ||
| 969 | |||
| 970 | 22528 | _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1); | |
| 971 | 22528 | frame_ptr += 32; | |
| 972 | } | ||
| 973 | |||
| 974 | 8190 | frame_ptr += frame_half; | |
| 975 | } | ||
| 976 | 24 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
| 977 | |||
| 978 | 24 | num_branches = num_branches << 1; | |
| 979 | 24 | frame_half = frame_half >> 1; | |
| 980 | 24 | stage--; | |
| 981 | } | ||
| 982 | } | ||
| 983 | |||
| 984 | // This last part requires at least 32-bit frames. | ||
| 985 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
| 986 | |||
| 987 | // reset pointers to correct positions. | ||
| 988 | 2 | frame_ptr = frame; | |
| 989 | 2 | temp_ptr = temp; | |
| 990 | |||
| 991 | // prefetch first chunk. | ||
| 992 | 2 | __VOLK_PREFETCH(temp_ptr); | |
| 993 | |||
| 994 | 2 | const __m256i shuffle_stage4 = _mm256_setr_epi8(0, | |
| 995 | 8, | ||
| 996 | 4, | ||
| 997 | 12, | ||
| 998 | 2, | ||
| 999 | 10, | ||
| 1000 | 6, | ||
| 1001 | 14, | ||
| 1002 | 1, | ||
| 1003 | 9, | ||
| 1004 | 5, | ||
| 1005 | 13, | ||
| 1006 | 3, | ||
| 1007 | 11, | ||
| 1008 | 7, | ||
| 1009 | 15, | ||
| 1010 | 0, | ||
| 1011 | 8, | ||
| 1012 | 4, | ||
| 1013 | 12, | ||
| 1014 | 2, | ||
| 1015 | 10, | ||
| 1016 | 6, | ||
| 1017 | 14, | ||
| 1018 | 1, | ||
| 1019 | 9, | ||
| 1020 | 5, | ||
| 1021 | 13, | ||
| 1022 | 3, | ||
| 1023 | 11, | ||
| 1024 | 7, | ||
| 1025 | 15); | ||
| 1026 | 2 | const __m256i mask_stage4 = _mm256_set_epi8(0x0, | |
| 1027 | 0x0, | ||
| 1028 | 0x0, | ||
| 1029 | 0x0, | ||
| 1030 | 0x0, | ||
| 1031 | 0x0, | ||
| 1032 | 0x0, | ||
| 1033 | 0x0, | ||
| 1034 | 0xFF, | ||
| 1035 | 0xFF, | ||
| 1036 | 0xFF, | ||
| 1037 | 0xFF, | ||
| 1038 | 0xFF, | ||
| 1039 | 0xFF, | ||
| 1040 | 0xFF, | ||
| 1041 | 0xFF, | ||
| 1042 | 0x0, | ||
| 1043 | 0x0, | ||
| 1044 | 0x0, | ||
| 1045 | 0x0, | ||
| 1046 | 0x0, | ||
| 1047 | 0x0, | ||
| 1048 | 0x0, | ||
| 1049 | 0x0, | ||
| 1050 | 0xFF, | ||
| 1051 | 0xFF, | ||
| 1052 | 0xFF, | ||
| 1053 | 0xFF, | ||
| 1054 | 0xFF, | ||
| 1055 | 0xFF, | ||
| 1056 | 0xFF, | ||
| 1057 | 0xFF); | ||
| 1058 | 2 | const __m256i mask_stage3 = _mm256_set_epi8(0x0, | |
| 1059 | 0x0, | ||
| 1060 | 0x0, | ||
| 1061 | 0x0, | ||
| 1062 | 0xFF, | ||
| 1063 | 0xFF, | ||
| 1064 | 0xFF, | ||
| 1065 | 0xFF, | ||
| 1066 | 0x0, | ||
| 1067 | 0x0, | ||
| 1068 | 0x0, | ||
| 1069 | 0x0, | ||
| 1070 | 0xFF, | ||
| 1071 | 0xFF, | ||
| 1072 | 0xFF, | ||
| 1073 | 0xFF, | ||
| 1074 | 0x0, | ||
| 1075 | 0x0, | ||
| 1076 | 0x0, | ||
| 1077 | 0x0, | ||
| 1078 | 0xFF, | ||
| 1079 | 0xFF, | ||
| 1080 | 0xFF, | ||
| 1081 | 0xFF, | ||
| 1082 | 0x0, | ||
| 1083 | 0x0, | ||
| 1084 | 0x0, | ||
| 1085 | 0x0, | ||
| 1086 | 0xFF, | ||
| 1087 | 0xFF, | ||
| 1088 | 0xFF, | ||
| 1089 | 0xFF); | ||
| 1090 | 2 | const __m256i mask_stage2 = _mm256_set_epi8(0x0, | |
| 1091 | 0x0, | ||
| 1092 | 0xFF, | ||
| 1093 | 0xFF, | ||
| 1094 | 0x0, | ||
| 1095 | 0x0, | ||
| 1096 | 0xFF, | ||
| 1097 | 0xFF, | ||
| 1098 | 0x0, | ||
| 1099 | 0x0, | ||
| 1100 | 0xFF, | ||
| 1101 | 0xFF, | ||
| 1102 | 0x0, | ||
| 1103 | 0x0, | ||
| 1104 | 0xFF, | ||
| 1105 | 0xFF, | ||
| 1106 | 0x0, | ||
| 1107 | 0x0, | ||
| 1108 | 0xFF, | ||
| 1109 | 0xFF, | ||
| 1110 | 0x0, | ||
| 1111 | 0x0, | ||
| 1112 | 0xFF, | ||
| 1113 | 0xFF, | ||
| 1114 | 0x0, | ||
| 1115 | 0x0, | ||
| 1116 | 0xFF, | ||
| 1117 | 0xFF, | ||
| 1118 | 0x0, | ||
| 1119 | 0x0, | ||
| 1120 | 0xFF, | ||
| 1121 | 0xFF); | ||
| 1122 | |||
| 1123 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 2 times. | 4098 | for (branch = 0; branch < num_branches / 2; ++branch) { |
| 1124 | 4096 | r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); | |
| 1125 | |||
| 1126 | // prefetch next chunk | ||
| 1127 | 4096 | temp_ptr += 32; | |
| 1128 | 4096 | __VOLK_PREFETCH(temp_ptr); | |
| 1129 | |||
| 1130 | // shuffle once for bit-reversal. | ||
| 1131 | 4096 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); | |
| 1132 | |||
| 1133 | 4096 | shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes | |
| 1134 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage4); | |
| 1135 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_temp0); | |
| 1136 | |||
| 1137 | 4096 | shifted = _mm256_srli_si256(r_frame0, 4); | |
| 1138 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage3); | |
| 1139 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 1140 | |||
| 1141 | 4096 | shifted = _mm256_srli_si256(r_frame0, 2); | |
| 1142 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage2); | |
| 1143 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 1144 | |||
| 1145 | 4096 | shifted = _mm256_srli_si256(r_frame0, 1); | |
| 1146 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
| 1147 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
| 1148 | |||
| 1149 | // store result of chunk. | ||
| 1150 | _mm256_store_si256((__m256i*)frame_ptr, r_frame0); | ||
| 1151 | 4096 | frame_ptr += 32; | |
| 1152 | } | ||
| 1153 | } | ||
| 1154 | #endif /* LV_HAVE_AVX2 */ | ||
| 1155 | |||
| 1156 | |||
| 1157 | #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ | ||
| 1158 |||
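
The `log2_of_power_of_2()` helper at the top of the listing assumes `frame_size` is an exact power of two and assembles the exponent bit by bit from the five masks. A minimal standalone sketch of the same bit-hack with a quick self-check; the `main()` driver is illustrative only and not part of VOLK:

```c
/* Standalone check of the bit-hack used by log2_of_power_of_2().
 * The function body mirrors the listing above; main() is a hypothetical
 * test driver, not part of the kernel. */
#include <assert.h>
#include <stdio.h>

static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };
    unsigned int res = (val & b[0]) != 0; /* bit 0 of the exponent */
    res |= ((val & b[4]) != 0) << 4;      /* bit 4 */
    res |= ((val & b[3]) != 0) << 3;      /* bit 3 */
    res |= ((val & b[2]) != 0) << 2;      /* bit 2 */
    res |= ((val & b[1]) != 0) << 1;      /* bit 1 */
    return res;
}

int main(void)
{
    unsigned int k;
    for (k = 0; k < 32; ++k) {
        /* holds for exact powers of two only */
        assert(log2_of_power_of_2(1u << k) == k);
    }
    printf("log2_of_power_of_2(1024) = %u\n", log2_of_power_of_2(1024));
    return 0;
}
```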
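`encodepolar_single_stage()` performs one butterfly pass: for every consecutive pair in `temp`, the XOR of the pair lands in the first half of the current branch of `frame` and the second element lands in the second half. The generic kernel repeats this for `log2(frame_size)` stages, doubling the branch count and halving the branch width each time, and overwrites `temp` along the way. A plain-C sketch of that loop on a toy 8-byte frame follows; the buffer contents and the `main()` driver are illustrative assumptions, while the butterfly itself follows the listing:

```c
/* Plain-C sketch of the stage loop used by
 * volk_8u_x2_encodeframepolar_8u_generic(). */
#include <stdio.h>
#include <string.h>

static void single_stage(unsigned char* frame, const unsigned char* temp,
                         unsigned int num_branches, unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            frame[0] = temp[0] ^ temp[1]; /* XOR of the pair -> first half  */
            frame[frame_half] = temp[1];  /* second element  -> second half */
            ++frame;
            temp += 2;
        }
        frame += frame_half; /* skip the half that was just written */
    }
}

int main(void)
{
    unsigned char temp[8] = { 1, 0, 1, 1, 0, 1, 0, 0 }; /* example input bits */
    unsigned char frame[8] = { 0 };

    unsigned int stage = 3;      /* log2(8) */
    unsigned int frame_half = 4; /* 8 >> 1 */
    unsigned int num_branches = 1;

    while (stage) {
        single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(frame)); /* temp is clobbered, as in the kernel */
        num_branches <<= 1;
        frame_half >>= 1;
        --stage;
    }

    for (unsigned int i = 0; i < 8; ++i)
        printf("%u ", frame[i]);
    printf("\n");
    return 0;
}
```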
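In a normal VOLK build these protokernels are reached through the generated dispatcher rather than called by name; the dispatcher selects the aligned (`_a_`), unaligned (`_u_`) or generic variant at run time. A hedged usage sketch, assuming the usual `volk_8u_x2_encodeframepolar_8u()` dispatcher and the standard `volk_malloc()` / `volk_get_alignment()` / `volk_free()` helpers; error handling is omitted:

```c
/* Sketch of calling the kernel through the VOLK dispatcher.
 * Assumes <volk/volk.h> provides volk_8u_x2_encodeframepolar_8u(),
 * volk_malloc(), volk_get_alignment() and volk_free(), as in a normal
 * VOLK installation. */
#include <volk/volk.h>
#include <string.h>

void encode_example(void) /* hypothetical caller, not part of VOLK */
{
    const unsigned int frame_size = 1024; /* must be a power of two */
    size_t alignment = volk_get_alignment();

    unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
    unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);

    /* Fill temp with the bit pattern to encode; the kernel consumes
     * and overwrites it. */
    memset(temp, 0, frame_size);
    temp[frame_size - 1] = 1;

    /* Aligned buffers let the dispatcher pick the _a_ protokernels;
     * unaligned pointers fall back to the _u_ or generic versions. */
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    volk_free(frame);
    volk_free(temp);
}
```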