GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8u_x2_encodeframepolar_8u.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       319     323      98.8%
Functions:     7       7     100.0%
Branches:     56      58      96.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*
11 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
12 */
13
14 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
15 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
16 #include <string.h>
17
18 2072 static inline unsigned int log2_of_power_of_2(unsigned int val)
19 {
20 // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
21 static const unsigned int b[] = {
22 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
23 };
24
25 2072 unsigned int res = (val & b[0]) != 0;
26
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2056 times.
2072 res |= ((val & b[4]) != 0) << 4;
27
2/2
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 2000 times.
2072 res |= ((val & b[3]) != 0) << 3;
28
2/2
✓ Branch 0 taken 972 times.
✓ Branch 1 taken 1100 times.
2072 res |= ((val & b[2]) != 0) << 2;
29
2/2
✓ Branch 0 taken 1228 times.
✓ Branch 1 taken 844 times.
2072 res |= ((val & b[1]) != 0) << 1;
30 2072 return res;
31 }
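
The bit-hack above returns correct results only for power-of-two inputs, which is all the encoder ever passes in (frame sizes). A minimal standalone check, assuming the header is reachable as volk/volk_8u_x2_encodeframepolar_8u.h and no LV_HAVE_* SIMD macros are defined, so only the scalar code is compiled:

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    /* log2_of_power_of_2 is only meaningful for powers of two. */
    unsigned int v;
    for (v = 1; v <= 1024; v <<= 1)
        printf("log2_of_power_of_2(%u) = %u\n", v, log2_of_power_of_2(v));
    return 0;
}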
32
33 4200 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
34 const unsigned char* temp_ptr,
35 const unsigned int num_branches,
36 const unsigned int frame_half)
37 {
38 unsigned int branch, bit;
39
2/2
✓ Branch 0 taken 166648 times.
✓ Branch 1 taken 4200 times.
170848 for (branch = 0; branch < num_branches; ++branch) {
40
2/2
✓ Branch 0 taken 1216512 times.
✓ Branch 1 taken 166648 times.
1383160 for (bit = 0; bit < frame_half; ++bit) {
41 1216512 *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
42 1216512 *(frame_ptr + frame_half) = *(temp_ptr + 1);
43 1216512 ++frame_ptr;
44 1216512 temp_ptr += 2;
45 }
46 166648 frame_ptr += frame_half;
47 }
48 4200 }
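
One butterfly stage maps each pair of consecutive temp bytes to the two halves of the current branch: frame[i] = temp[2i] ^ temp[2i+1] and frame[i + frame_half] = temp[2i+1]. A minimal sketch of a single call, under the same include assumptions as above (num_branches = 1, frame_half = 4 for an 8-element frame):

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    unsigned char temp[8] = { 1, 0, 1, 1, 0, 1, 0, 0 };
    unsigned char frame[8] = { 0 };
    int i;

    /* frame[i] = temp[2i] ^ temp[2i+1]; frame[i + 4] = temp[2i + 1] */
    encodepolar_single_stage(frame, temp, 1, 4);

    for (i = 0; i < 8; ++i)
        printf("%u ", frame[i]); /* prints: 1 0 1 0 0 1 1 0 */
    printf("\n");
    return 0;
}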
49
50 #ifdef LV_HAVE_GENERIC
51
52 1288 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
53 unsigned char* temp,
54 unsigned int frame_size)
55 {
56 1288 unsigned int stage = log2_of_power_of_2(frame_size);
57 1288 unsigned int frame_half = frame_size >> 1;
58 1288 unsigned int num_branches = 1;
59
60
2/2
✓ Branch 0 taken 4200 times.
✓ Branch 1 taken 1288 times.
5488 while (stage) {
61 // encode stage
62 4200 encodepolar_single_stage(frame, temp, num_branches, frame_half);
63 4200 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
64
65 // update all the parameters.
66 4200 num_branches = num_branches << 1;
67 4200 frame_half = frame_half >> 1;
68 4200 --stage;
69 }
70 1288 }
71 #endif /* LV_HAVE_GENERIC */
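
The generic kernel chains log2(frame_size) such stages and copies the intermediate result back into temp after each one, so temp is clobbered. A minimal usage sketch, assuming the header is on the include path and LV_HAVE_GENERIC is defined (e.g. via -DLV_HAVE_GENERIC):

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    /* frame_size must be a power of two; temp holds the input bits, one per byte. */
    unsigned char temp[8] = { 1, 1, 0, 1, 0, 0, 1, 0 };
    unsigned char frame[8];
    int i;

    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);

    for (i = 0; i < 8; ++i)
        printf("%u", frame[i]);
    printf("\n");
    return 0;
}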
72
73 #ifdef LV_HAVE_SSSE3
74 #include <tmmintrin.h>
75
76 1024 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
77 unsigned char* temp,
78 unsigned int frame_size)
79 {
80
2/2
✓ Branch 0 taken 512 times.
✓ Branch 1 taken 512 times.
1024 if (frame_size < 16) {
81 512 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
82 512 return;
83 }
84
85 512 const unsigned int po2 = log2_of_power_of_2(frame_size);
86
87 512 unsigned int stage = po2;
88 512 unsigned char* frame_ptr = frame;
89 512 unsigned char* temp_ptr = temp;
90
91 512 unsigned int frame_half = frame_size >> 1;
92 512 unsigned int num_branches = 1;
93 unsigned int branch;
94 unsigned int bit;
95
96 // prepare constants
97 512 const __m128i mask_stage1 = _mm_set_epi8(0x0,
98 0xFF,
99 0x0,
100 0xFF,
101 0x0,
102 0xFF,
103 0x0,
104 0xFF,
105 0x0,
106 0xFF,
107 0x0,
108 0xFF,
109 0x0,
110 0xFF,
111 0x0,
112 0xFF);
113
114 // get some SIMD registers to play with.
115 __m128i r_frame0, r_temp0, shifted;
116
117 {
118 __m128i r_frame1, r_temp1;
119 const __m128i shuffle_separate =
120 512 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
121
122
2/2
✓ Branch 0 taken 518 times.
✓ Branch 1 taken 512 times.
1030 while (stage > 4) {
123 518 frame_ptr = frame;
124 518 temp_ptr = temp;
125
126 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
127
2/2
✓ Branch 0 taken 9728 times.
✓ Branch 1 taken 518 times.
10246 for (branch = 0; branch < num_branches; ++branch) {
128
2/2
✓ Branch 0 taken 52736 times.
✓ Branch 1 taken 9728 times.
62464 for (bit = 0; bit < frame_half; bit += 16) {
129 52736 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
130 52736 temp_ptr += 16;
131 52736 r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
132 52736 temp_ptr += 16;
133
134 52736 shifted = _mm_srli_si128(r_temp0, 1);
135 52736 shifted = _mm_and_si128(shifted, mask_stage1);
136 52736 r_temp0 = _mm_xor_si128(shifted, r_temp0);
137 52736 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
138
139 52736 shifted = _mm_srli_si128(r_temp1, 1);
140 52736 shifted = _mm_and_si128(shifted, mask_stage1);
141 52736 r_temp1 = _mm_xor_si128(shifted, r_temp1);
142 52736 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
143
144 52736 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
145 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
146
147 52736 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
148 52736 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
149 52736 frame_ptr += 16;
150 }
151
152 9728 frame_ptr += frame_half;
153 }
154 518 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
155
156 518 num_branches = num_branches << 1;
157 518 frame_half = frame_half >> 1;
158 518 stage--;
159 }
160 }
161
162 // This last part requires frames of at least 16 elements.
163 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
164
165 // reset pointers to correct positions.
166 512 frame_ptr = frame;
167 512 temp_ptr = temp;
168
169 // prefetch first chunk
170 512 __VOLK_PREFETCH(temp_ptr);
171
172 const __m128i shuffle_stage4 =
173 512 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
174 512 const __m128i mask_stage4 = _mm_set_epi8(0x0,
175 0x0,
176 0x0,
177 0x0,
178 0x0,
179 0x0,
180 0x0,
181 0x0,
182 0xFF,
183 0xFF,
184 0xFF,
185 0xFF,
186 0xFF,
187 0xFF,
188 0xFF,
189 0xFF);
190 512 const __m128i mask_stage3 = _mm_set_epi8(0x0,
191 0x0,
192 0x0,
193 0x0,
194 0xFF,
195 0xFF,
196 0xFF,
197 0xFF,
198 0x0,
199 0x0,
200 0x0,
201 0x0,
202 0xFF,
203 0xFF,
204 0xFF,
205 0xFF);
206 512 const __m128i mask_stage2 = _mm_set_epi8(0x0,
207 0x0,
208 0xFF,
209 0xFF,
210 0x0,
211 0x0,
212 0xFF,
213 0xFF,
214 0x0,
215 0x0,
216 0xFF,
217 0xFF,
218 0x0,
219 0x0,
220 0xFF,
221 0xFF);
222
223
2/2
✓ Branch 0 taken 10240 times.
✓ Branch 1 taken 512 times.
10752 for (branch = 0; branch < num_branches; ++branch) {
224 10240 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
225
226 // prefetch next chunk
227 10240 temp_ptr += 16;
228 10240 __VOLK_PREFETCH(temp_ptr);
229
230 // shuffle once for bit-reversal.
231 10240 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
232
233 10240 shifted = _mm_srli_si128(r_temp0, 8);
234 10240 shifted = _mm_and_si128(shifted, mask_stage4);
235 10240 r_frame0 = _mm_xor_si128(shifted, r_temp0);
236
237 10240 shifted = _mm_srli_si128(r_frame0, 4);
238 10240 shifted = _mm_and_si128(shifted, mask_stage3);
239 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
240
241 10240 shifted = _mm_srli_si128(r_frame0, 2);
242 10240 shifted = _mm_and_si128(shifted, mask_stage2);
243 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
244
245 10240 shifted = _mm_srli_si128(r_frame0, 1);
246 10240 shifted = _mm_and_si128(shifted, mask_stage1);
247 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
248
249 // store result of chunk.
250 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
251 10240 frame_ptr += 16;
252 }
253 }
254
255 #endif /* LV_HAVE_SSSE3 */
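
The unaligned SSSE3 kernel produces the same frames as the generic one; it just processes 16 temp bytes per step (stage-1 XOR via byte shift and mask, then a shuffle to separate the two halves). A minimal consistency check, assuming compilation with -mssse3 -DLV_HAVE_GENERIC -DLV_HAVE_SSSE3 and a hypothetical no-op fallback for __VOLK_PREFETCH when volk_common.h is not pulled in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef __VOLK_PREFETCH
#define __VOLK_PREFETCH(addr) /* no-op fallback for a standalone build */
#endif
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    enum { N = 256 }; /* power-of-two frame size */
    unsigned char temp_a[N], temp_b[N], frame_a[N], frame_b[N];
    int i;

    for (i = 0; i < N; ++i)
        temp_a[i] = temp_b[i] = (unsigned char)(rand() & 1);

    volk_8u_x2_encodeframepolar_8u_generic(frame_a, temp_a, N);
    volk_8u_x2_encodeframepolar_8u_u_ssse3(frame_b, temp_b, N);

    printf("%s\n", memcmp(frame_a, frame_b, N) ? "MISMATCH" : "match");
    return 0;
}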
256
257 #ifdef LV_HAVE_AVX2
258 #include <immintrin.h>
259
260 1024 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
261 unsigned char* temp,
262 unsigned int frame_size)
263 {
264
2/2
✓ Branch 0 taken 768 times.
✓ Branch 1 taken 256 times.
1024 if (frame_size < 32) {
265 768 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
266 768 return;
267 }
268
269 256 const unsigned int po2 = log2_of_power_of_2(frame_size);
270
271 256 unsigned int stage = po2;
272 256 unsigned char* frame_ptr = frame;
273 256 unsigned char* temp_ptr = temp;
274
275 256 unsigned int frame_half = frame_size >> 1;
276 256 unsigned int num_branches = 1;
277 unsigned int branch;
278 unsigned int bit;
279
280 // prepare constants
281 256 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
282 0xFF,
283 0x0,
284 0xFF,
285 0x0,
286 0xFF,
287 0x0,
288 0xFF,
289 0x0,
290 0xFF,
291 0x0,
292 0xFF,
293 0x0,
294 0xFF,
295 0x0,
296 0xFF,
297 0x0,
298 0xFF,
299 0x0,
300 0xFF,
301 0x0,
302 0xFF,
303 0x0,
304 0xFF,
305 0x0,
306 0xFF,
307 0x0,
308 0xFF,
309 0x0,
310 0xFF,
311 0x0,
312 0xFF);
313
314 256 const __m128i mask_stage0 = _mm_set_epi8(0x0,
315 0xFF,
316 0x0,
317 0xFF,
318 0x0,
319 0xFF,
320 0x0,
321 0xFF,
322 0x0,
323 0xFF,
324 0x0,
325 0xFF,
326 0x0,
327 0xFF,
328 0x0,
329 0xFF);
330 // get some SIMD registers to play with.
331 __m256i r_frame0, r_temp0, shifted;
332 __m128i r_temp2, r_frame2, shifted2;
333 {
334 __m256i r_frame1, r_temp1;
335 __m128i r_frame3, r_temp3;
336 256 const __m256i shuffle_separate = _mm256_setr_epi8(0,
337 2,
338 4,
339 6,
340 8,
341 10,
342 12,
343 14,
344 1,
345 3,
346 5,
347 7,
348 9,
349 11,
350 13,
351 15,
352 0,
353 2,
354 4,
355 6,
356 8,
357 10,
358 12,
359 14,
360 1,
361 3,
362 5,
363 7,
364 9,
365 11,
366 13,
367 15);
368 const __m128i shuffle_separate128 =
369 256 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
370
371
2/2
✓ Branch 0 taken 518 times.
✓ Branch 1 taken 256 times.
774 while (stage > 4) {
372 518 frame_ptr = frame;
373 518 temp_ptr = temp;
374
375 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
376
2/2
✓ Branch 0 taken 9728 times.
✓ Branch 1 taken 518 times.
10246 for (branch = 0; branch < num_branches; ++branch) {
377
2/2
✓ Branch 0 taken 28864 times.
✓ Branch 1 taken 4736 times.
33600 for (bit = 0; bit < frame_half; bit += 32) {
378
2/2
✓ Branch 0 taken 4992 times.
✓ Branch 1 taken 23872 times.
28864 if ((frame_half - bit) <
379 32) // only 16 elements remain in this branch half, not 32
380 {
381 4992 r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
382 4992 temp_ptr += 16;
383 4992 r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
384 4992 temp_ptr += 16;
385
386 4992 shifted2 = _mm_srli_si128(r_temp2, 1);
387 4992 shifted2 = _mm_and_si128(shifted2, mask_stage0);
388 4992 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
389 4992 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
390
391 4992 shifted2 = _mm_srli_si128(r_temp3, 1);
392 4992 shifted2 = _mm_and_si128(shifted2, mask_stage0);
393 4992 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
394 4992 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
395
396 4992 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
397 _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
398
399 4992 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
400 4992 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
401 4992 frame_ptr += 16;
402 4992 break;
403 }
404 23872 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
405 23872 temp_ptr += 32;
406 23872 r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
407 23872 temp_ptr += 32;
408
409 23872 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
410 23872 shifted = _mm256_and_si256(shifted, mask_stage1);
411 23872 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
412 23872 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
413
414 23872 shifted = _mm256_srli_si256(r_temp1, 1);
415 23872 shifted = _mm256_and_si256(shifted, mask_stage1);
416 23872 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
417 23872 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
418
419 23872 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
420 23872 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
421 23872 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
422 23872 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
423
424 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
425
426 23872 _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
427 23872 frame_ptr += 32;
428 }
429
430 9728 frame_ptr += frame_half;
431 }
432 518 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
433
434 518 num_branches = num_branches << 1;
435 518 frame_half = frame_half >> 1;
436 518 stage--;
437 }
438 }
439
440 // This last part requires frames of at least 32 elements.
441 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
442
443 // reset pointers to correct positions.
444 256 frame_ptr = frame;
445 256 temp_ptr = temp;
446
447 // prefetch first chunk
448 256 __VOLK_PREFETCH(temp_ptr);
449
450 256 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
451 8,
452 4,
453 12,
454 2,
455 10,
456 6,
457 14,
458 1,
459 9,
460 5,
461 13,
462 3,
463 11,
464 7,
465 15,
466 0,
467 8,
468 4,
469 12,
470 2,
471 10,
472 6,
473 14,
474 1,
475 9,
476 5,
477 13,
478 3,
479 11,
480 7,
481 15);
482 256 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
483 0x0,
484 0x0,
485 0x0,
486 0x0,
487 0x0,
488 0x0,
489 0x0,
490 0xFF,
491 0xFF,
492 0xFF,
493 0xFF,
494 0xFF,
495 0xFF,
496 0xFF,
497 0xFF,
498 0x0,
499 0x0,
500 0x0,
501 0x0,
502 0x0,
503 0x0,
504 0x0,
505 0x0,
506 0xFF,
507 0xFF,
508 0xFF,
509 0xFF,
510 0xFF,
511 0xFF,
512 0xFF,
513 0xFF);
514 256 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
515 0x0,
516 0x0,
517 0x0,
518 0xFF,
519 0xFF,
520 0xFF,
521 0xFF,
522 0x0,
523 0x0,
524 0x0,
525 0x0,
526 0xFF,
527 0xFF,
528 0xFF,
529 0xFF,
530 0x0,
531 0x0,
532 0x0,
533 0x0,
534 0xFF,
535 0xFF,
536 0xFF,
537 0xFF,
538 0x0,
539 0x0,
540 0x0,
541 0x0,
542 0xFF,
543 0xFF,
544 0xFF,
545 0xFF);
546 256 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
547 0x0,
548 0xFF,
549 0xFF,
550 0x0,
551 0x0,
552 0xFF,
553 0xFF,
554 0x0,
555 0x0,
556 0xFF,
557 0xFF,
558 0x0,
559 0x0,
560 0xFF,
561 0xFF,
562 0x0,
563 0x0,
564 0xFF,
565 0xFF,
566 0x0,
567 0x0,
568 0xFF,
569 0xFF,
570 0x0,
571 0x0,
572 0xFF,
573 0xFF,
574 0x0,
575 0x0,
576 0xFF,
577 0xFF);
578
579
2/2
✓ Branch 0 taken 4992 times.
✓ Branch 1 taken 256 times.
5248 for (branch = 0; branch < num_branches / 2; ++branch) {
580 4992 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
581
582 // prefetch next chunk
583 4992 temp_ptr += 32;
584 4992 __VOLK_PREFETCH(temp_ptr);
585
586 // shuffle once for bit-reversal.
587 4992 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
588
589 4992 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
590 4992 shifted = _mm256_and_si256(shifted, mask_stage4);
591 4992 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
592
593
594 4992 shifted = _mm256_srli_si256(r_frame0, 4);
595 4992 shifted = _mm256_and_si256(shifted, mask_stage3);
596 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
597
598 4992 shifted = _mm256_srli_si256(r_frame0, 2);
599 4992 shifted = _mm256_and_si256(shifted, mask_stage2);
600 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
601
602 4992 shifted = _mm256_srli_si256(r_frame0, 1);
603 4992 shifted = _mm256_and_si256(shifted, mask_stage1);
604 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
605
606 // store result of chunk.
607 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
608 4992 frame_ptr += 32;
609 }
610 }
611 #endif /* LV_HAVE_AVX2 */
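
Note that _mm256_srli_si256 shifts bytes within each 128-bit lane independently, which is why the code above comments "operate on 128 bit lanes" and needs the _mm256_permute4x64_epi64 fix-ups. A minimal illustration, assuming an AVX2-capable build (-mavx2):

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    unsigned char in[32], out[32];
    int i;

    for (i = 0; i < 32; ++i)
        in[i] = (unsigned char)i;

    __m256i v = _mm256_loadu_si256((__m256i*)in);
    __m256i s = _mm256_srli_si256(v, 1); /* per-lane byte shift, not a full 256-bit shift */
    _mm256_storeu_si256((__m256i*)out, s);

    /* out: 1..15, 0, 17..31, 0 -- each lane shifts in its own zero byte */
    for (i = 0; i < 32; ++i)
        printf("%u ", out[i]);
    printf("\n");
    return 0;
}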
612
613 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
614
615 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
616 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
617
618 #ifdef LV_HAVE_SSSE3
619 #include <tmmintrin.h>
620
621 2 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
622 unsigned char* temp,
623 unsigned int frame_size)
624 {
625
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (frame_size < 16) {
626 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
627 return;
628 }
629
630 2 const unsigned int po2 = log2_of_power_of_2(frame_size);
631
632 2 unsigned int stage = po2;
633 2 unsigned char* frame_ptr = frame;
634 2 unsigned char* temp_ptr = temp;
635
636 2 unsigned int frame_half = frame_size >> 1;
637 2 unsigned int num_branches = 1;
638 unsigned int branch;
639 unsigned int bit;
640
641 // prepare constants
642 2 const __m128i mask_stage1 = _mm_set_epi8(0x0,
643 0xFF,
644 0x0,
645 0xFF,
646 0x0,
647 0xFF,
648 0x0,
649 0xFF,
650 0x0,
651 0xFF,
652 0x0,
653 0xFF,
654 0x0,
655 0xFF,
656 0x0,
657 0xFF);
658
659 // get some SIMD registers to play with.
660 __m128i r_frame0, r_temp0, shifted;
661
662 {
663 __m128i r_frame1, r_temp1;
664 const __m128i shuffle_separate =
665 2 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
666
667
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
26 while (stage > 4) {
668 24 frame_ptr = frame;
669 24 temp_ptr = temp;
670
671 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
672
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 24 times.
8214 for (branch = 0; branch < num_branches; ++branch) {
673
2/2
✓ Branch 0 taken 49152 times.
✓ Branch 1 taken 8190 times.
57342 for (bit = 0; bit < frame_half; bit += 16) {
674 49152 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
675 49152 temp_ptr += 16;
676 49152 r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
677 49152 temp_ptr += 16;
678
679 49152 shifted = _mm_srli_si128(r_temp0, 1);
680 49152 shifted = _mm_and_si128(shifted, mask_stage1);
681 49152 r_temp0 = _mm_xor_si128(shifted, r_temp0);
682 49152 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
683
684 49152 shifted = _mm_srli_si128(r_temp1, 1);
685 49152 shifted = _mm_and_si128(shifted, mask_stage1);
686 49152 r_temp1 = _mm_xor_si128(shifted, r_temp1);
687 49152 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
688
689 49152 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
690 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
691
692 49152 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
693 49152 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
694 49152 frame_ptr += 16;
695 }
696
697 8190 frame_ptr += frame_half;
698 }
699 24 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
700
701 24 num_branches = num_branches << 1;
702 24 frame_half = frame_half >> 1;
703 24 stage--;
704 }
705 }
706
707 // This last part requires frames of at least 16 elements.
708 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
709
710 // reset pointers to correct positions.
711 2 frame_ptr = frame;
712 2 temp_ptr = temp;
713
714 // prefetch first chunk
715 2 __VOLK_PREFETCH(temp_ptr);
716
717 const __m128i shuffle_stage4 =
718 2 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
719 2 const __m128i mask_stage4 = _mm_set_epi8(0x0,
720 0x0,
721 0x0,
722 0x0,
723 0x0,
724 0x0,
725 0x0,
726 0x0,
727 0xFF,
728 0xFF,
729 0xFF,
730 0xFF,
731 0xFF,
732 0xFF,
733 0xFF,
734 0xFF);
735 2 const __m128i mask_stage3 = _mm_set_epi8(0x0,
736 0x0,
737 0x0,
738 0x0,
739 0xFF,
740 0xFF,
741 0xFF,
742 0xFF,
743 0x0,
744 0x0,
745 0x0,
746 0x0,
747 0xFF,
748 0xFF,
749 0xFF,
750 0xFF);
751 2 const __m128i mask_stage2 = _mm_set_epi8(0x0,
752 0x0,
753 0xFF,
754 0xFF,
755 0x0,
756 0x0,
757 0xFF,
758 0xFF,
759 0x0,
760 0x0,
761 0xFF,
762 0xFF,
763 0x0,
764 0x0,
765 0xFF,
766 0xFF);
767
768
2/2
✓ Branch 0 taken 8192 times.
✓ Branch 1 taken 2 times.
8194 for (branch = 0; branch < num_branches; ++branch) {
769 8192 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
770
771 // prefetch next chunk
772 8192 temp_ptr += 16;
773 8192 __VOLK_PREFETCH(temp_ptr);
774
775 // shuffle once for bit-reversal.
776 8192 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
777
778 8192 shifted = _mm_srli_si128(r_temp0, 8);
779 8192 shifted = _mm_and_si128(shifted, mask_stage4);
780 8192 r_frame0 = _mm_xor_si128(shifted, r_temp0);
781
782 8192 shifted = _mm_srli_si128(r_frame0, 4);
783 8192 shifted = _mm_and_si128(shifted, mask_stage3);
784 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
785
786 8192 shifted = _mm_srli_si128(r_frame0, 2);
787 8192 shifted = _mm_and_si128(shifted, mask_stage2);
788 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
789
790 8192 shifted = _mm_srli_si128(r_frame0, 1);
791 8192 shifted = _mm_and_si128(shifted, mask_stage1);
792 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
793
794 // store result of chunk.
795 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
796 8192 frame_ptr += 16;
797 }
798 }
799 #endif /* LV_HAVE_SSSE3 */
800
801 #ifdef LV_HAVE_AVX2
802 #include <immintrin.h>
803
804 2 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
805 unsigned char* temp,
806 unsigned int frame_size)
807 {
808
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (frame_size < 32) {
809 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
810 return;
811 }
812
813 2 const unsigned int po2 = log2_of_power_of_2(frame_size);
814
815 2 unsigned int stage = po2;
816 2 unsigned char* frame_ptr = frame;
817 2 unsigned char* temp_ptr = temp;
818
819 2 unsigned int frame_half = frame_size >> 1;
820 2 unsigned int num_branches = 1;
821 unsigned int branch;
822 unsigned int bit;
823
824 // prepare constants
825 2 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
826 0xFF,
827 0x0,
828 0xFF,
829 0x0,
830 0xFF,
831 0x0,
832 0xFF,
833 0x0,
834 0xFF,
835 0x0,
836 0xFF,
837 0x0,
838 0xFF,
839 0x0,
840 0xFF,
841 0x0,
842 0xFF,
843 0x0,
844 0xFF,
845 0x0,
846 0xFF,
847 0x0,
848 0xFF,
849 0x0,
850 0xFF,
851 0x0,
852 0xFF,
853 0x0,
854 0xFF,
855 0x0,
856 0xFF);
857
858 2 const __m128i mask_stage0 = _mm_set_epi8(0x0,
859 0xFF,
860 0x0,
861 0xFF,
862 0x0,
863 0xFF,
864 0x0,
865 0xFF,
866 0x0,
867 0xFF,
868 0x0,
869 0xFF,
870 0x0,
871 0xFF,
872 0x0,
873 0xFF);
874 // get some SIMD registers to play with.
875 __m256i r_frame0, r_temp0, shifted;
876 __m128i r_temp2, r_frame2, shifted2;
877 {
878 __m256i r_frame1, r_temp1;
879 __m128i r_frame3, r_temp3;
880 2 const __m256i shuffle_separate = _mm256_setr_epi8(0,
881 2,
882 4,
883 6,
884 8,
885 10,
886 12,
887 14,
888 1,
889 3,
890 5,
891 7,
892 9,
893 11,
894 13,
895 15,
896 0,
897 2,
898 4,
899 6,
900 8,
901 10,
902 12,
903 14,
904 1,
905 3,
906 5,
907 7,
908 9,
909 11,
910 13,
911 15);
912 const __m128i shuffle_separate128 =
913 2 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
914
915
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
26 while (stage > 4) {
916 24 frame_ptr = frame;
917 24 temp_ptr = temp;
918
919 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
920
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 24 times.
8214 for (branch = 0; branch < num_branches; ++branch) {
921
2/2
✓ Branch 0 taken 26624 times.
✓ Branch 1 taken 4094 times.
30718 for (bit = 0; bit < frame_half; bit += 32) {
922
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 22528 times.
26624 if ((frame_half - bit) <
923 32) // only 16 elements remain in this branch half, not 32
924 {
925 4096 r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
926 4096 temp_ptr += 16;
927 4096 r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
928 4096 temp_ptr += 16;
929
930 4096 shifted2 = _mm_srli_si128(r_temp2, 1);
931 4096 shifted2 = _mm_and_si128(shifted2, mask_stage0);
932 4096 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
933 4096 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
934
935 4096 shifted2 = _mm_srli_si128(r_temp3, 1);
936 4096 shifted2 = _mm_and_si128(shifted2, mask_stage0);
937 4096 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
938 4096 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
939
940 4096 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
941 _mm_store_si128((__m128i*)frame_ptr, r_frame2);
942
943 4096 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
944 4096 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
945 4096 frame_ptr += 16;
946 4096 break;
947 }
948 22528 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
949 22528 temp_ptr += 32;
950 22528 r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
951 22528 temp_ptr += 32;
952
953 22528 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
954 22528 shifted = _mm256_and_si256(shifted, mask_stage1);
955 22528 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
956 22528 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
957
958 22528 shifted = _mm256_srli_si256(r_temp1, 1);
959 22528 shifted = _mm256_and_si256(shifted, mask_stage1);
960 22528 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
961 22528 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
962
963 22528 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
964 22528 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
965 22528 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
966 22528 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
967
968 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
969
970 22528 _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
971 22528 frame_ptr += 32;
972 }
973
974 8190 frame_ptr += frame_half;
975 }
976 24 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
977
978 24 num_branches = num_branches << 1;
979 24 frame_half = frame_half >> 1;
980 24 stage--;
981 }
982 }
983
984 // This last part requires frames of at least 32 elements.
985 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
986
987 // reset pointers to correct positions.
988 2 frame_ptr = frame;
989 2 temp_ptr = temp;
990
991 // prefetch first chunk.
992 2 __VOLK_PREFETCH(temp_ptr);
993
994 2 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
995 8,
996 4,
997 12,
998 2,
999 10,
1000 6,
1001 14,
1002 1,
1003 9,
1004 5,
1005 13,
1006 3,
1007 11,
1008 7,
1009 15,
1010 0,
1011 8,
1012 4,
1013 12,
1014 2,
1015 10,
1016 6,
1017 14,
1018 1,
1019 9,
1020 5,
1021 13,
1022 3,
1023 11,
1024 7,
1025 15);
1026 2 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1027 0x0,
1028 0x0,
1029 0x0,
1030 0x0,
1031 0x0,
1032 0x0,
1033 0x0,
1034 0xFF,
1035 0xFF,
1036 0xFF,
1037 0xFF,
1038 0xFF,
1039 0xFF,
1040 0xFF,
1041 0xFF,
1042 0x0,
1043 0x0,
1044 0x0,
1045 0x0,
1046 0x0,
1047 0x0,
1048 0x0,
1049 0x0,
1050 0xFF,
1051 0xFF,
1052 0xFF,
1053 0xFF,
1054 0xFF,
1055 0xFF,
1056 0xFF,
1057 0xFF);
1058 2 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1059 0x0,
1060 0x0,
1061 0x0,
1062 0xFF,
1063 0xFF,
1064 0xFF,
1065 0xFF,
1066 0x0,
1067 0x0,
1068 0x0,
1069 0x0,
1070 0xFF,
1071 0xFF,
1072 0xFF,
1073 0xFF,
1074 0x0,
1075 0x0,
1076 0x0,
1077 0x0,
1078 0xFF,
1079 0xFF,
1080 0xFF,
1081 0xFF,
1082 0x0,
1083 0x0,
1084 0x0,
1085 0x0,
1086 0xFF,
1087 0xFF,
1088 0xFF,
1089 0xFF);
1090 2 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1091 0x0,
1092 0xFF,
1093 0xFF,
1094 0x0,
1095 0x0,
1096 0xFF,
1097 0xFF,
1098 0x0,
1099 0x0,
1100 0xFF,
1101 0xFF,
1102 0x0,
1103 0x0,
1104 0xFF,
1105 0xFF,
1106 0x0,
1107 0x0,
1108 0xFF,
1109 0xFF,
1110 0x0,
1111 0x0,
1112 0xFF,
1113 0xFF,
1114 0x0,
1115 0x0,
1116 0xFF,
1117 0xFF,
1118 0x0,
1119 0x0,
1120 0xFF,
1121 0xFF);
1122
1123
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 2 times.
4098 for (branch = 0; branch < num_branches / 2; ++branch) {
1124 4096 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1125
1126 // prefetch next chunk
1127 4096 temp_ptr += 32;
1128 4096 __VOLK_PREFETCH(temp_ptr);
1129
1130 // shuffle once for bit-reversal.
1131 4096 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1132
1133 4096 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1134 4096 shifted = _mm256_and_si256(shifted, mask_stage4);
1135 4096 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1136
1137 4096 shifted = _mm256_srli_si256(r_frame0, 4);
1138 4096 shifted = _mm256_and_si256(shifted, mask_stage3);
1139 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1140
1141 4096 shifted = _mm256_srli_si256(r_frame0, 2);
1142 4096 shifted = _mm256_and_si256(shifted, mask_stage2);
1143 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1144
1145 4096 shifted = _mm256_srli_si256(r_frame0, 1);
1146 4096 shifted = _mm256_and_si256(shifted, mask_stage1);
1147 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1148
1149 // store result of chunk.
1150 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1151 4096 frame_ptr += 32;
1152 }
1153 }
1154 #endif /* LV_HAVE_AVX2 */
1155
1156
1157 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
1158
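
In normal use none of these kernels is called directly: linking against the VOLK library and allocating both buffers with volk_malloc lets the generated dispatcher pick an aligned (or otherwise best) implementation at run time. A minimal sketch under those assumptions (volk.h, volk_malloc, volk_get_alignment, and the volk_8u_x2_encodeframepolar_8u dispatcher come from the installed library):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int frame_size = 1024; /* must be a power of two */
    unsigned char* temp =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned char* frame =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned int i;

    for (i = 0; i < frame_size; ++i)
        temp[i] = (unsigned char)(i & 1); /* dummy input bits, one per byte */

    /* the dispatcher selects e.g. the a_avx2 kernel on aligned, AVX2-capable builds */
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    printf("frame[0] = %u\n", frame[0]);

    volk_free(frame);
    volk_free(temp);
    return 0;
}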