GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8u_x2_encodeframepolar_8u.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       319     323      98.8%
Functions:     7       7     100.0%
Branches:     56      58      96.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*
11 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
12 */
13
14 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
15 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
16 #include <string.h>
17
18 2072 static inline unsigned int log2_of_power_of_2(unsigned int val)
19 {
20 // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
21 static const unsigned int b[] = {
22 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
23 };
24
25 2072 unsigned int res = (val & b[0]) != 0;
26
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2056 times.
2072 res |= ((val & b[4]) != 0) << 4;
27
2/2
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 2000 times.
2072 res |= ((val & b[3]) != 0) << 3;
28
2/2
✓ Branch 0 taken 972 times.
✓ Branch 1 taken 1100 times.
2072 res |= ((val & b[2]) != 0) << 2;
29
2/2
✓ Branch 0 taken 1228 times.
✓ Branch 1 taken 844 times.
2072 res |= ((val & b[1]) != 0) << 1;
30 2072 return res;
31 }
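
The bit-hack above returns correct results only for power-of-two inputs, which is all the encoder ever passes in (frame sizes). A minimal standalone check, assuming the header is reachable as volk/volk_8u_x2_encodeframepolar_8u.h and no LV_HAVE_* SIMD macros are defined, so only the scalar code is compiled:

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    /* log2_of_power_of_2 is only meaningful for powers of two. */
    unsigned int v;
    for (v = 1; v <= 1024; v <<= 1)
        printf("log2_of_power_of_2(%u) = %u\n", v, log2_of_power_of_2(v));
    return 0;
}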
32
33 4200 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
34 const unsigned char* temp_ptr,
35 const unsigned int num_branches,
36 const unsigned int frame_half)
37 {
38 unsigned int branch, bit;
39
2/2
✓ Branch 0 taken 166648 times.
✓ Branch 1 taken 4200 times.
170848 for (branch = 0; branch < num_branches; ++branch) {
40
2/2
✓ Branch 0 taken 1216512 times.
✓ Branch 1 taken 166648 times.
1383160 for (bit = 0; bit < frame_half; ++bit) {
41 1216512 *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
42 1216512 *(frame_ptr + frame_half) = *(temp_ptr + 1);
43 1216512 ++frame_ptr;
44 1216512 temp_ptr += 2;
45 }
46 166648 frame_ptr += frame_half;
47 }
48 4200 }
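
One butterfly stage maps each pair of consecutive temp bytes to the two halves of the current branch: frame[i] = temp[2i] ^ temp[2i+1] and frame[i + frame_half] = temp[2i+1]. A minimal sketch of a single call, under the same include assumptions as above (num_branches = 1, frame_half = 4 for an 8-element frame):

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    unsigned char temp[8] = { 1, 0, 1, 1, 0, 1, 0, 0 };
    unsigned char frame[8] = { 0 };
    int i;

    /* frame[i] = temp[2i] ^ temp[2i+1]; frame[i + 4] = temp[2i + 1] */
    encodepolar_single_stage(frame, temp, 1, 4);

    for (i = 0; i < 8; ++i)
        printf("%u ", frame[i]); /* prints: 1 0 1 0 0 1 1 0 */
    printf("\n");
    return 0;
}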
49
50 #ifdef LV_HAVE_GENERIC
51
52 1288 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
53 unsigned char* temp,
54 unsigned int frame_size)
55 {
56 1288 unsigned int stage = log2_of_power_of_2(frame_size);
57 1288 unsigned int frame_half = frame_size >> 1;
58 1288 unsigned int num_branches = 1;
59
60
2/2
✓ Branch 0 taken 4200 times.
✓ Branch 1 taken 1288 times.
5488 while (stage) {
61 // encode stage
62 4200 encodepolar_single_stage(frame, temp, num_branches, frame_half);
63 4200 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
64
65 // update all the parameters.
66 4200 num_branches = num_branches << 1;
67 4200 frame_half = frame_half >> 1;
68 4200 --stage;
69 }
70 1288 }
71 #endif /* LV_HAVE_GENERIC */
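
The generic kernel chains log2(frame_size) such stages and copies the intermediate result back into temp after each one, so temp is clobbered. A minimal usage sketch, assuming the header is on the include path and LV_HAVE_GENERIC is defined (e.g. via -DLV_HAVE_GENERIC):

#include <stdio.h>
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    /* frame_size must be a power of two; temp holds the input bits, one per byte. */
    unsigned char temp[8] = { 1, 1, 0, 1, 0, 0, 1, 0 };
    unsigned char frame[8];
    int i;

    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);

    for (i = 0; i < 8; ++i)
        printf("%u", frame[i]);
    printf("\n");
    return 0;
}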
72
73 #ifdef LV_HAVE_SSSE3
74 #include <tmmintrin.h>
75
76 1024 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
77 unsigned char* temp,
78 unsigned int frame_size)
79 {
80
2/2
✓ Branch 0 taken 512 times.
✓ Branch 1 taken 512 times.
1024 if (frame_size < 16) {
81 512 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
82 512 return;
83 }
84
85 512 const unsigned int po2 = log2_of_power_of_2(frame_size);
86
87 512 unsigned int stage = po2;
88 512 unsigned char* frame_ptr = frame;
89 512 unsigned char* temp_ptr = temp;
90
91 512 unsigned int frame_half = frame_size >> 1;
92 512 unsigned int num_branches = 1;
93 unsigned int branch;
94 unsigned int bit;
95
96 // prepare constants
97 512 const __m128i mask_stage1 = _mm_set_epi8(0x0,
98 0xFF,
99 0x0,
100 0xFF,
101 0x0,
102 0xFF,
103 0x0,
104 0xFF,
105 0x0,
106 0xFF,
107 0x0,
108 0xFF,
109 0x0,
110 0xFF,
111 0x0,
112 0xFF);
113
114 // get some SIMD registers to play with.
115 __m128i r_frame0, r_temp0, shifted;
116
117 {
118 __m128i r_frame1, r_temp1;
119 const __m128i shuffle_separate =
120 512 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
121
122
2/2
✓ Branch 0 taken 518 times.
✓ Branch 1 taken 512 times.
1030 while (stage > 4) {
123 518 frame_ptr = frame;
124 518 temp_ptr = temp;
125
126 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
127
2/2
✓ Branch 0 taken 9728 times.
✓ Branch 1 taken 518 times.
10246 for (branch = 0; branch < num_branches; ++branch) {
128
2/2
✓ Branch 0 taken 52736 times.
✓ Branch 1 taken 9728 times.
62464 for (bit = 0; bit < frame_half; bit += 16) {
129 52736 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
130 52736 temp_ptr += 16;
131 52736 r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
132 52736 temp_ptr += 16;
133
134 52736 shifted = _mm_srli_si128(r_temp0, 1);
135 52736 shifted = _mm_and_si128(shifted, mask_stage1);
136 52736 r_temp0 = _mm_xor_si128(shifted, r_temp0);
137 52736 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
138
139 52736 shifted = _mm_srli_si128(r_temp1, 1);
140 52736 shifted = _mm_and_si128(shifted, mask_stage1);
141 52736 r_temp1 = _mm_xor_si128(shifted, r_temp1);
142 52736 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
143
144 52736 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
145 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
146
147 52736 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
148 52736 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
149 52736 frame_ptr += 16;
150 }
151
152 9728 frame_ptr += frame_half;
153 }
154 518 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
155
156 518 num_branches = num_branches << 1;
157 518 frame_half = frame_half >> 1;
158 518 stage--;
159 }
160 }
161
162 // This last part requires frames of at least 16 elements.
163 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
164
165 // reset pointers to correct positions.
166 512 frame_ptr = frame;
167 512 temp_ptr = temp;
168
169 // prefetch first chunk
170 512 __VOLK_PREFETCH(temp_ptr);
171
172 const __m128i shuffle_stage4 =
173 512 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
174 512 const __m128i mask_stage4 = _mm_set_epi8(0x0,
175 0x0,
176 0x0,
177 0x0,
178 0x0,
179 0x0,
180 0x0,
181 0x0,
182 0xFF,
183 0xFF,
184 0xFF,
185 0xFF,
186 0xFF,
187 0xFF,
188 0xFF,
189 0xFF);
190 512 const __m128i mask_stage3 = _mm_set_epi8(0x0,
191 0x0,
192 0x0,
193 0x0,
194 0xFF,
195 0xFF,
196 0xFF,
197 0xFF,
198 0x0,
199 0x0,
200 0x0,
201 0x0,
202 0xFF,
203 0xFF,
204 0xFF,
205 0xFF);
206 512 const __m128i mask_stage2 = _mm_set_epi8(0x0,
207 0x0,
208 0xFF,
209 0xFF,
210 0x0,
211 0x0,
212 0xFF,
213 0xFF,
214 0x0,
215 0x0,
216 0xFF,
217 0xFF,
218 0x0,
219 0x0,
220 0xFF,
221 0xFF);
222
223
2/2
✓ Branch 0 taken 10240 times.
✓ Branch 1 taken 512 times.
10752 for (branch = 0; branch < num_branches; ++branch) {
224 10240 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
225
226 // prefetch next chunk
227 10240 temp_ptr += 16;
228 10240 __VOLK_PREFETCH(temp_ptr);
229
230 // shuffle once for bit-reversal.
231 10240 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
232
233 10240 shifted = _mm_srli_si128(r_temp0, 8);
234 10240 shifted = _mm_and_si128(shifted, mask_stage4);
235 10240 r_frame0 = _mm_xor_si128(shifted, r_temp0);
236
237 10240 shifted = _mm_srli_si128(r_frame0, 4);
238 10240 shifted = _mm_and_si128(shifted, mask_stage3);
239 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
240
241 10240 shifted = _mm_srli_si128(r_frame0, 2);
242 10240 shifted = _mm_and_si128(shifted, mask_stage2);
243 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
244
245 10240 shifted = _mm_srli_si128(r_frame0, 1);
246 10240 shifted = _mm_and_si128(shifted, mask_stage1);
247 10240 r_frame0 = _mm_xor_si128(shifted, r_frame0);
248
249 // store result of chunk.
250 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
251 10240 frame_ptr += 16;
252 }
253 }
254
255 #endif /* LV_HAVE_SSSE3 */
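
The unaligned SSSE3 kernel produces the same frames as the generic one; it just processes 16 temp bytes per step (stage-1 XOR via byte shift and mask, then a shuffle to separate the two halves). A minimal consistency check, assuming compilation with -mssse3 -DLV_HAVE_GENERIC -DLV_HAVE_SSSE3 and a hypothetical no-op fallback for __VOLK_PREFETCH when volk_common.h is not pulled in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef __VOLK_PREFETCH
#define __VOLK_PREFETCH(addr) /* no-op fallback for a standalone build */
#endif
#include "volk/volk_8u_x2_encodeframepolar_8u.h"

int main(void)
{
    enum { N = 256 }; /* power-of-two frame size */
    unsigned char temp_a[N], temp_b[N], frame_a[N], frame_b[N];
    int i;

    for (i = 0; i < N; ++i)
        temp_a[i] = temp_b[i] = (unsigned char)(rand() & 1);

    volk_8u_x2_encodeframepolar_8u_generic(frame_a, temp_a, N);
    volk_8u_x2_encodeframepolar_8u_u_ssse3(frame_b, temp_b, N);

    printf("%s\n", memcmp(frame_a, frame_b, N) ? "MISMATCH" : "match");
    return 0;
}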
256
257 #ifdef LV_HAVE_AVX2
258 #include <immintrin.h>
259
260 1024 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
261 unsigned char* temp,
262 unsigned int frame_size)
263 {
264
2/2
✓ Branch 0 taken 768 times.
✓ Branch 1 taken 256 times.
1024 if (frame_size < 32) {
265 768 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
266 768 return;
267 }
268
269 256 const unsigned int po2 = log2_of_power_of_2(frame_size);
270
271 256 unsigned int stage = po2;
272 256 unsigned char* frame_ptr = frame;
273 256 unsigned char* temp_ptr = temp;
274
275 256 unsigned int frame_half = frame_size >> 1;
276 256 unsigned int num_branches = 1;
277 unsigned int branch;
278 unsigned int bit;
279
280 // prepare constants
281 256 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
282 0xFF,
283 0x0,
284 0xFF,
285 0x0,
286 0xFF,
287 0x0,
288 0xFF,
289 0x0,
290 0xFF,
291 0x0,
292 0xFF,
293 0x0,
294 0xFF,
295 0x0,
296 0xFF,
297 0x0,
298 0xFF,
299 0x0,
300 0xFF,
301 0x0,
302 0xFF,
303 0x0,
304 0xFF,
305 0x0,
306 0xFF,
307 0x0,
308 0xFF,
309 0x0,
310 0xFF,
311 0x0,
312 0xFF);
313
314 256 const __m128i mask_stage0 = _mm_set_epi8(0x0,
315 0xFF,
316 0x0,
317 0xFF,
318 0x0,
319 0xFF,
320 0x0,
321 0xFF,
322 0x0,
323 0xFF,
324 0x0,
325 0xFF,
326 0x0,
327 0xFF,
328 0x0,
329 0xFF);
330 // get some SIMD registers to play with.
331 __m256i r_frame0, r_temp0, shifted;
332 __m128i r_temp2, r_frame2, shifted2;
333 {
334 __m256i r_frame1, r_temp1;
335 __m128i r_frame3, r_temp3;
336 256 const __m256i shuffle_separate = _mm256_setr_epi8(0,
337 2,
338 4,
339 6,
340 8,
341 10,
342 12,
343 14,
344 1,
345 3,
346 5,
347 7,
348 9,
349 11,
350 13,
351 15,
352 0,
353 2,
354 4,
355 6,
356 8,
357 10,
358 12,
359 14,
360 1,
361 3,
362 5,
363 7,
364 9,
365 11,
366 13,
367 15);
368 const __m128i shuffle_separate128 =
369 256 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
370
371
2/2
✓ Branch 0 taken 518 times.
✓ Branch 1 taken 256 times.
774 while (stage > 4) {
372 518 frame_ptr = frame;
373 518 temp_ptr = temp;
374
375 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
376
2/2
✓ Branch 0 taken 9728 times.
✓ Branch 1 taken 518 times.
10246 for (branch = 0; branch < num_branches; ++branch) {
377
2/2
✓ Branch 0 taken 28864 times.
✓ Branch 1 taken 4736 times.
33600 for (bit = 0; bit < frame_half; bit += 32) {
378
2/2
✓ Branch 0 taken 4992 times.
✓ Branch 1 taken 23872 times.
28864 if ((frame_half - bit) <
379 32) // only 16 elements remain in this branch half, not 32
380 {
381 4992 r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
382 4992 temp_ptr += 16;
383 4992 r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
384 4992 temp_ptr += 16;
385
386 4992 shifted2 = _mm_srli_si128(r_temp2, 1);
387 4992 shifted2 = _mm_and_si128(shifted2, mask_stage0);
388 4992 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
389 4992 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
390
391 4992 shifted2 = _mm_srli_si128(r_temp3, 1);
392 4992 shifted2 = _mm_and_si128(shifted2, mask_stage0);
393 4992 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
394 4992 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
395
396 4992 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
397 _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
398
399 4992 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
400 4992 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
401 4992 frame_ptr += 16;
402 4992 break;
403 }
404 23872 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
405 23872 temp_ptr += 32;
406 23872 r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
407 23872 temp_ptr += 32;
408
409 23872 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
410 23872 shifted = _mm256_and_si256(shifted, mask_stage1);
411 23872 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
412 23872 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
413
414 23872 shifted = _mm256_srli_si256(r_temp1, 1);
415 23872 shifted = _mm256_and_si256(shifted, mask_stage1);
416 23872 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
417 23872 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
418
419 23872 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
420 23872 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
421 23872 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
422 23872 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
423
424 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
425
426 23872 _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
427 23872 frame_ptr += 32;
428 }
429
430 9728 frame_ptr += frame_half;
431 }
432 518 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
433
434 518 num_branches = num_branches << 1;
435 518 frame_half = frame_half >> 1;
436 518 stage--;
437 }
438 }
439
440 // This last part requires frames of at least 32 elements.
441 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
442
443 // reset pointers to correct positions.
444 256 frame_ptr = frame;
445 256 temp_ptr = temp;
446
447 // prefetch first chunk
448 256 __VOLK_PREFETCH(temp_ptr);
449
450 256 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
451 8,
452 4,
453 12,
454 2,
455 10,
456 6,
457 14,
458 1,
459 9,
460 5,
461 13,
462 3,
463 11,
464 7,
465 15,
466 0,
467 8,
468 4,
469 12,
470 2,
471 10,
472 6,
473 14,
474 1,
475 9,
476 5,
477 13,
478 3,
479 11,
480 7,
481 15);
482 256 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
483 0x0,
484 0x0,
485 0x0,
486 0x0,
487 0x0,
488 0x0,
489 0x0,
490 0xFF,
491 0xFF,
492 0xFF,
493 0xFF,
494 0xFF,
495 0xFF,
496 0xFF,
497 0xFF,
498 0x0,
499 0x0,
500 0x0,
501 0x0,
502 0x0,
503 0x0,
504 0x0,
505 0x0,
506 0xFF,
507 0xFF,
508 0xFF,
509 0xFF,
510 0xFF,
511 0xFF,
512 0xFF,
513 0xFF);
514 256 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
515 0x0,
516 0x0,
517 0x0,
518 0xFF,
519 0xFF,
520 0xFF,
521 0xFF,
522 0x0,
523 0x0,
524 0x0,
525 0x0,
526 0xFF,
527 0xFF,
528 0xFF,
529 0xFF,
530 0x0,
531 0x0,
532 0x0,
533 0x0,
534 0xFF,
535 0xFF,
536 0xFF,
537 0xFF,
538 0x0,
539 0x0,
540 0x0,
541 0x0,
542 0xFF,
543 0xFF,
544 0xFF,
545 0xFF);
546 256 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
547 0x0,
548 0xFF,
549 0xFF,
550 0x0,
551 0x0,
552 0xFF,
553 0xFF,
554 0x0,
555 0x0,
556 0xFF,
557 0xFF,
558 0x0,
559 0x0,
560 0xFF,
561 0xFF,
562 0x0,
563 0x0,
564 0xFF,
565 0xFF,
566 0x0,
567 0x0,
568 0xFF,
569 0xFF,
570 0x0,
571 0x0,
572 0xFF,
573 0xFF,
574 0x0,
575 0x0,
576 0xFF,
577 0xFF);
578
579
2/2
✓ Branch 0 taken 4992 times.
✓ Branch 1 taken 256 times.
5248 for (branch = 0; branch < num_branches / 2; ++branch) {
580 4992 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
581
582 // prefetch next chunk
583 4992 temp_ptr += 32;
584 4992 __VOLK_PREFETCH(temp_ptr);
585
586 // shuffle once for bit-reversal.
587 4992 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
588
589 4992 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
590 4992 shifted = _mm256_and_si256(shifted, mask_stage4);
591 4992 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
592
593
594 4992 shifted = _mm256_srli_si256(r_frame0, 4);
595 4992 shifted = _mm256_and_si256(shifted, mask_stage3);
596 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
597
598 4992 shifted = _mm256_srli_si256(r_frame0, 2);
599 4992 shifted = _mm256_and_si256(shifted, mask_stage2);
600 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
601
602 4992 shifted = _mm256_srli_si256(r_frame0, 1);
603 4992 shifted = _mm256_and_si256(shifted, mask_stage1);
604 4992 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
605
606 // store result of chunk.
607 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
608 4992 frame_ptr += 32;
609 }
610 }
611 #endif /* LV_HAVE_AVX2 */
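
Note that _mm256_srli_si256 shifts bytes within each 128-bit lane independently, which is why the code above comments "operate on 128 bit lanes" and needs the _mm256_permute4x64_epi64 fix-ups. A minimal illustration, assuming an AVX2-capable build (-mavx2):

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    unsigned char in[32], out[32];
    int i;

    for (i = 0; i < 32; ++i)
        in[i] = (unsigned char)i;

    __m256i v = _mm256_loadu_si256((__m256i*)in);
    __m256i s = _mm256_srli_si256(v, 1); /* per-lane byte shift, not a full 256-bit shift */
    _mm256_storeu_si256((__m256i*)out, s);

    /* out: 1..15, 0, 17..31, 0 -- each lane shifts in its own zero byte */
    for (i = 0; i < 32; ++i)
        printf("%u ", out[i]);
    printf("\n");
    return 0;
}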
612
613 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
614
615 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
616 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
617
618 #ifdef LV_HAVE_SSSE3
619 #include <tmmintrin.h>
620
621 2 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
622 unsigned char* temp,
623 unsigned int frame_size)
624 {
625
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (frame_size < 16) {
626 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
627 return;
628 }
629
630 2 const unsigned int po2 = log2_of_power_of_2(frame_size);
631
632 2 unsigned int stage = po2;
633 2 unsigned char* frame_ptr = frame;
634 2 unsigned char* temp_ptr = temp;
635
636 2 unsigned int frame_half = frame_size >> 1;
637 2 unsigned int num_branches = 1;
638 unsigned int branch;
639 unsigned int bit;
640
641 // prepare constants
642 2 const __m128i mask_stage1 = _mm_set_epi8(0x0,
643 0xFF,
644 0x0,
645 0xFF,
646 0x0,
647 0xFF,
648 0x0,
649 0xFF,
650 0x0,
651 0xFF,
652 0x0,
653 0xFF,
654 0x0,
655 0xFF,
656 0x0,
657 0xFF);
658
659 // get some SIMD registers to play with.
660 __m128i r_frame0, r_temp0, shifted;
661
662 {
663 __m128i r_frame1, r_temp1;
664 const __m128i shuffle_separate =
665 2 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
666
667
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
26 while (stage > 4) {
668 24 frame_ptr = frame;
669 24 temp_ptr = temp;
670
671 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
672
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 24 times.
8214 for (branch = 0; branch < num_branches; ++branch) {
673
2/2
✓ Branch 0 taken 49152 times.
✓ Branch 1 taken 8190 times.
57342 for (bit = 0; bit < frame_half; bit += 16) {
674 49152 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
675 49152 temp_ptr += 16;
676 49152 r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
677 49152 temp_ptr += 16;
678
679 49152 shifted = _mm_srli_si128(r_temp0, 1);
680 49152 shifted = _mm_and_si128(shifted, mask_stage1);
681 49152 r_temp0 = _mm_xor_si128(shifted, r_temp0);
682 49152 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
683
684 49152 shifted = _mm_srli_si128(r_temp1, 1);
685 49152 shifted = _mm_and_si128(shifted, mask_stage1);
686 49152 r_temp1 = _mm_xor_si128(shifted, r_temp1);
687 49152 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
688
689 49152 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
690 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
691
692 49152 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
693 49152 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
694 49152 frame_ptr += 16;
695 }
696
697 8190 frame_ptr += frame_half;
698 }
699 24 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
700
701 24 num_branches = num_branches << 1;
702 24 frame_half = frame_half >> 1;
703 24 stage--;
704 }
705 }
706
707 // This last part requires frames of at least 16 elements.
708 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
709
710 // reset pointers to correct positions.
711 2 frame_ptr = frame;
712 2 temp_ptr = temp;
713
714 // prefetch first chunk
715 2 __VOLK_PREFETCH(temp_ptr);
716
717 const __m128i shuffle_stage4 =
718 2 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
719 2 const __m128i mask_stage4 = _mm_set_epi8(0x0,
720 0x0,
721 0x0,
722 0x0,
723 0x0,
724 0x0,
725 0x0,
726 0x0,
727 0xFF,
728 0xFF,
729 0xFF,
730 0xFF,
731 0xFF,
732 0xFF,
733 0xFF,
734 0xFF);
735 2 const __m128i mask_stage3 = _mm_set_epi8(0x0,
736 0x0,
737 0x0,
738 0x0,
739 0xFF,
740 0xFF,
741 0xFF,
742 0xFF,
743 0x0,
744 0x0,
745 0x0,
746 0x0,
747 0xFF,
748 0xFF,
749 0xFF,
750 0xFF);
751 2 const __m128i mask_stage2 = _mm_set_epi8(0x0,
752 0x0,
753 0xFF,
754 0xFF,
755 0x0,
756 0x0,
757 0xFF,
758 0xFF,
759 0x0,
760 0x0,
761 0xFF,
762 0xFF,
763 0x0,
764 0x0,
765 0xFF,
766 0xFF);
767
768
2/2
✓ Branch 0 taken 8192 times.
✓ Branch 1 taken 2 times.
8194 for (branch = 0; branch < num_branches; ++branch) {
769 8192 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
770
771 // prefetch next chunk
772 8192 temp_ptr += 16;
773 8192 __VOLK_PREFETCH(temp_ptr);
774
775 // shuffle once for bit-reversal.
776 8192 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
777
778 8192 shifted = _mm_srli_si128(r_temp0, 8);
779 8192 shifted = _mm_and_si128(shifted, mask_stage4);
780 8192 r_frame0 = _mm_xor_si128(shifted, r_temp0);
781
782 8192 shifted = _mm_srli_si128(r_frame0, 4);
783 8192 shifted = _mm_and_si128(shifted, mask_stage3);
784 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
785
786 8192 shifted = _mm_srli_si128(r_frame0, 2);
787 8192 shifted = _mm_and_si128(shifted, mask_stage2);
788 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
789
790 8192 shifted = _mm_srli_si128(r_frame0, 1);
791 8192 shifted = _mm_and_si128(shifted, mask_stage1);
792 8192 r_frame0 = _mm_xor_si128(shifted, r_frame0);
793
794 // store result of chunk.
795 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
796 8192 frame_ptr += 16;
797 }
798 }
799 #endif /* LV_HAVE_SSSE3 */
800
801 #ifdef LV_HAVE_AVX2
802 #include <immintrin.h>
803
804 2 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
805 unsigned char* temp,
806 unsigned int frame_size)
807 {
808
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (frame_size < 32) {
809 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
810 return;
811 }
812
813 2 const unsigned int po2 = log2_of_power_of_2(frame_size);
814
815 2 unsigned int stage = po2;
816 2 unsigned char* frame_ptr = frame;
817 2 unsigned char* temp_ptr = temp;
818
819 2 unsigned int frame_half = frame_size >> 1;
820 2 unsigned int num_branches = 1;
821 unsigned int branch;
822 unsigned int bit;
823
824 // prepare constants
825 2 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
826 0xFF,
827 0x0,
828 0xFF,
829 0x0,
830 0xFF,
831 0x0,
832 0xFF,
833 0x0,
834 0xFF,
835 0x0,
836 0xFF,
837 0x0,
838 0xFF,
839 0x0,
840 0xFF,
841 0x0,
842 0xFF,
843 0x0,
844 0xFF,
845 0x0,
846 0xFF,
847 0x0,
848 0xFF,
849 0x0,
850 0xFF,
851 0x0,
852 0xFF,
853 0x0,
854 0xFF,
855 0x0,
856 0xFF);
857
858 2 const __m128i mask_stage0 = _mm_set_epi8(0x0,
859 0xFF,
860 0x0,
861 0xFF,
862 0x0,
863 0xFF,
864 0x0,
865 0xFF,
866 0x0,
867 0xFF,
868 0x0,
869 0xFF,
870 0x0,
871 0xFF,
872 0x0,
873 0xFF);
874 // get some SIMD registers to play with.
875 __m256i r_frame0, r_temp0, shifted;
876 __m128i r_temp2, r_frame2, shifted2;
877 {
878 __m256i r_frame1, r_temp1;
879 __m128i r_frame3, r_temp3;
880 2 const __m256i shuffle_separate = _mm256_setr_epi8(0,
881 2,
882 4,
883 6,
884 8,
885 10,
886 12,
887 14,
888 1,
889 3,
890 5,
891 7,
892 9,
893 11,
894 13,
895 15,
896 0,
897 2,
898 4,
899 6,
900 8,
901 10,
902 12,
903 14,
904 1,
905 3,
906 5,
907 7,
908 9,
909 11,
910 13,
911 15);
912 const __m128i shuffle_separate128 =
913 2 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
914
915
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
26 while (stage > 4) {
916 24 frame_ptr = frame;
917 24 temp_ptr = temp;
918
919 // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
920
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 24 times.
8214 for (branch = 0; branch < num_branches; ++branch) {
921
2/2
✓ Branch 0 taken 26624 times.
✓ Branch 1 taken 4094 times.
30718 for (bit = 0; bit < frame_half; bit += 32) {
922
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 22528 times.
26624 if ((frame_half - bit) <
923 32) // only 16 elements remain in this branch half, not 32
924 {
925 4096 r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
926 4096 temp_ptr += 16;
927 4096 r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
928 4096 temp_ptr += 16;
929
930 4096 shifted2 = _mm_srli_si128(r_temp2, 1);
931 4096 shifted2 = _mm_and_si128(shifted2, mask_stage0);
932 4096 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
933 4096 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
934
935 4096 shifted2 = _mm_srli_si128(r_temp3, 1);
936 4096 shifted2 = _mm_and_si128(shifted2, mask_stage0);
937 4096 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
938 4096 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
939
940 4096 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
941 _mm_store_si128((__m128i*)frame_ptr, r_frame2);
942
943 4096 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
944 4096 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
945 4096 frame_ptr += 16;
946 4096 break;
947 }
948 22528 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
949 22528 temp_ptr += 32;
950 22528 r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
951 22528 temp_ptr += 32;
952
953 22528 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
954 22528 shifted = _mm256_and_si256(shifted, mask_stage1);
955 22528 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
956 22528 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
957
958 22528 shifted = _mm256_srli_si256(r_temp1, 1);
959 22528 shifted = _mm256_and_si256(shifted, mask_stage1);
960 22528 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
961 22528 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
962
963 22528 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
964 22528 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
965 22528 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
966 22528 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
967
968 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
969
970 22528 _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
971 22528 frame_ptr += 32;
972 }
973
974 8190 frame_ptr += frame_half;
975 }
976 24 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
977
978 24 num_branches = num_branches << 1;
979 24 frame_half = frame_half >> 1;
980 24 stage--;
981 }
982 }
983
984 // This last part requires frames of at least 32 elements.
985 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
986
987 // reset pointers to correct positions.
988 2 frame_ptr = frame;
989 2 temp_ptr = temp;
990
991 // prefetch first chunk.
992 2 __VOLK_PREFETCH(temp_ptr);
993
994 2 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
995 8,
996 4,
997 12,
998 2,
999 10,
1000 6,
1001 14,
1002 1,
1003 9,
1004 5,
1005 13,
1006 3,
1007 11,
1008 7,
1009 15,
1010 0,
1011 8,
1012 4,
1013 12,
1014 2,
1015 10,
1016 6,
1017 14,
1018 1,
1019 9,
1020 5,
1021 13,
1022 3,
1023 11,
1024 7,
1025 15);
1026 2 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1027 0x0,
1028 0x0,
1029 0x0,
1030 0x0,
1031 0x0,
1032 0x0,
1033 0x0,
1034 0xFF,
1035 0xFF,
1036 0xFF,
1037 0xFF,
1038 0xFF,
1039 0xFF,
1040 0xFF,
1041 0xFF,
1042 0x0,
1043 0x0,
1044 0x0,
1045 0x0,
1046 0x0,
1047 0x0,
1048 0x0,
1049 0x0,
1050 0xFF,
1051 0xFF,
1052 0xFF,
1053 0xFF,
1054 0xFF,
1055 0xFF,
1056 0xFF,
1057 0xFF);
1058 2 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1059 0x0,
1060 0x0,
1061 0x0,
1062 0xFF,
1063 0xFF,
1064 0xFF,
1065 0xFF,
1066 0x0,
1067 0x0,
1068 0x0,
1069 0x0,
1070 0xFF,
1071 0xFF,
1072 0xFF,
1073 0xFF,
1074 0x0,
1075 0x0,
1076 0x0,
1077 0x0,
1078 0xFF,
1079 0xFF,
1080 0xFF,
1081 0xFF,
1082 0x0,
1083 0x0,
1084 0x0,
1085 0x0,
1086 0xFF,
1087 0xFF,
1088 0xFF,
1089 0xFF);
1090 2 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1091 0x0,
1092 0xFF,
1093 0xFF,
1094 0x0,
1095 0x0,
1096 0xFF,
1097 0xFF,
1098 0x0,
1099 0x0,
1100 0xFF,
1101 0xFF,
1102 0x0,
1103 0x0,
1104 0xFF,
1105 0xFF,
1106 0x0,
1107 0x0,
1108 0xFF,
1109 0xFF,
1110 0x0,
1111 0x0,
1112 0xFF,
1113 0xFF,
1114 0x0,
1115 0x0,
1116 0xFF,
1117 0xFF,
1118 0x0,
1119 0x0,
1120 0xFF,
1121 0xFF);
1122
1123
2/2
✓ Branch 0 taken 4096 times.
✓ Branch 1 taken 2 times.
4098 for (branch = 0; branch < num_branches / 2; ++branch) {
1124 4096 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1125
1126 // prefetch next chunk
1127 4096 temp_ptr += 32;
1128 4096 __VOLK_PREFETCH(temp_ptr);
1129
1130 // shuffle once for bit-reversal.
1131 4096 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1132
1133 4096 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1134 4096 shifted = _mm256_and_si256(shifted, mask_stage4);
1135 4096 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1136
1137 4096 shifted = _mm256_srli_si256(r_frame0, 4);
1138 4096 shifted = _mm256_and_si256(shifted, mask_stage3);
1139 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1140
1141 4096 shifted = _mm256_srli_si256(r_frame0, 2);
1142 4096 shifted = _mm256_and_si256(shifted, mask_stage2);
1143 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1144
1145 4096 shifted = _mm256_srli_si256(r_frame0, 1);
1146 4096 shifted = _mm256_and_si256(shifted, mask_stage1);
1147 4096 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1148
1149 // store result of chunk.
1150 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1151 4096 frame_ptr += 32;
1152 }
1153 }
1154 #endif /* LV_HAVE_AVX2 */
1155
1156
1157 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
1158
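
In normal use none of these kernels is called directly: linking against the VOLK library and allocating both buffers with volk_malloc lets the generated dispatcher pick an aligned (or otherwise best) implementation at run time. A minimal sketch under those assumptions (volk.h, volk_malloc, volk_get_alignment, and the volk_8u_x2_encodeframepolar_8u dispatcher come from the installed library):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int frame_size = 1024; /* must be a power of two */
    unsigned char* temp =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned char* frame =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned int i;

    for (i = 0; i < frame_size; ++i)
        temp[i] = (unsigned char)(i & 1); /* dummy input bits, one per byte */

    /* the dispatcher selects e.g. the a_avx2 kernel on aligned, AVX2-capable builds */
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    printf("frame[0] = %u\n", frame[0]);

    volk_free(frame);
    volk_free(temp);
    return 0;
}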