GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32u_reverse_32u.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	154	154	100.0%
Functions:	7	7	100.0%
Branches:	14	14	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2018 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32u_reverse_32u
    
       *
    
       * \b bit reversal of the input 32 bit word
    
       * <b>Dispatcher Prototype</b>
    
       * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int
    
       num_points);
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector: The input vector
    
       * \li num_points The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The vector where the results will be stored, which is the
    
       bit-reversed input
    
       *
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
    
      struct dword_split {
    
          int b00 : 1;
    
          int b01 : 1;
    
          int b02 : 1;
    
          int b03 : 1;
    
          int b04 : 1;
    
          int b05 : 1;
    
          int b06 : 1;
    
          int b07 : 1;
    
          int b08 : 1;
    
          int b09 : 1;
    
          int b10 : 1;
    
          int b11 : 1;
    
          int b12 : 1;
    
          int b13 : 1;
    
          int b14 : 1;
    
          int b15 : 1;
    
          int b16 : 1;
    
          int b17 : 1;
    
          int b18 : 1;
    
          int b19 : 1;
    
          int b20 : 1;
    
          int b21 : 1;
    
          int b22 : 1;
    
          int b23 : 1;
    
          int b24 : 1;
    
          int b25 : 1;
    
          int b26 : 1;
    
          int b27 : 1;
    
          int b28 : 1;
    
          int b29 : 1;
    
          int b30 : 1;
    
          int b31 : 1;
    
      };
    
      struct char_split {
    
          uint8_t b00 : 1;
    
          uint8_t b01 : 1;
    
          uint8_t b02 : 1;
    
          uint8_t b03 : 1;
    
          uint8_t b04 : 1;
    
          uint8_t b05 : 1;
    
          uint8_t b06 : 1;
    
          uint8_t b07 : 1;
    
      };
    
      // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
    
      // http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
    
      static const unsigned char BitReverseTable256[] = {
    
          0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
    
          0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
    
          0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
    
          0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
    
          0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
    
          0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
    
          0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
    
          0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    
          0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
    
          0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
    
          0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
    
          0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
    
          0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
    
          0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
    
          0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
    
          0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    
          0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
    
          0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
    
          0x3F, 0xBF, 0x7F, 0xFF
    
      };
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
    
                                                            const uint32_t* in,
    
                                                            unsigned int num_points)
    
      {
    
      2
          const struct dword_split* in_ptr = (const struct dword_split*)in;
    
      2
          struct dword_split* out_ptr = (struct dword_split*)out;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              out_ptr->b00 = in_ptr->b31;
    
      262142
              out_ptr->b01 = in_ptr->b30;
    
      262142
              out_ptr->b02 = in_ptr->b29;
    
      262142
              out_ptr->b03 = in_ptr->b28;
    
      262142
              out_ptr->b04 = in_ptr->b27;
    
      262142
              out_ptr->b05 = in_ptr->b26;
    
      262142
              out_ptr->b06 = in_ptr->b25;
    
      262142
              out_ptr->b07 = in_ptr->b24;
    
      262142
              out_ptr->b08 = in_ptr->b23;
    
      262142
              out_ptr->b09 = in_ptr->b22;
    
      262142
              out_ptr->b10 = in_ptr->b21;
    
      262142
              out_ptr->b11 = in_ptr->b20;
    
      262142
              out_ptr->b12 = in_ptr->b19;
    
      262142
              out_ptr->b13 = in_ptr->b18;
    
      262142
              out_ptr->b14 = in_ptr->b17;
    
      262142
              out_ptr->b15 = in_ptr->b16;
    
      262142
              out_ptr->b16 = in_ptr->b15;
    
      262142
              out_ptr->b17 = in_ptr->b14;
    
      262142
              out_ptr->b18 = in_ptr->b13;
    
      262142
              out_ptr->b19 = in_ptr->b12;
    
      262142
              out_ptr->b20 = in_ptr->b11;
    
      262142
              out_ptr->b21 = in_ptr->b10;
    
      262142
              out_ptr->b22 = in_ptr->b09;
    
      262142
              out_ptr->b23 = in_ptr->b08;
    
      262142
              out_ptr->b24 = in_ptr->b07;
    
      262142
              out_ptr->b25 = in_ptr->b06;
    
      262142
              out_ptr->b26 = in_ptr->b05;
    
      262142
              out_ptr->b27 = in_ptr->b04;
    
      262142
              out_ptr->b28 = in_ptr->b03;
    
      262142
              out_ptr->b29 = in_ptr->b02;
    
      262142
              out_ptr->b30 = in_ptr->b01;
    
      262142
              out_ptr->b31 = in_ptr->b00;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
    
                                                           const uint32_t* in,
    
                                                           unsigned int num_points)
    
      {
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              const struct char_split* in8 = (const struct char_split*)in_ptr;
    
      262142
              struct char_split* out8 = (struct char_split*)out_ptr;
    
      262142
              out8[3].b00 = in8[0].b07;
    
      262142
              out8[3].b01 = in8[0].b06;
    
      262142
              out8[3].b02 = in8[0].b05;
    
      262142
              out8[3].b03 = in8[0].b04;
    
      262142
              out8[3].b04 = in8[0].b03;
    
      262142
              out8[3].b05 = in8[0].b02;
    
      262142
              out8[3].b06 = in8[0].b01;
    
      262142
              out8[3].b07 = in8[0].b00;
    
      262142
              out8[2].b00 = in8[1].b07;
    
      262142
              out8[2].b01 = in8[1].b06;
    
      262142
              out8[2].b02 = in8[1].b05;
    
      262142
              out8[2].b03 = in8[1].b04;
    
      262142
              out8[2].b04 = in8[1].b03;
    
      262142
              out8[2].b05 = in8[1].b02;
    
      262142
              out8[2].b06 = in8[1].b01;
    
      262142
              out8[2].b07 = in8[1].b00;
    
      262142
              out8[1].b00 = in8[2].b07;
    
      262142
              out8[1].b01 = in8[2].b06;
    
      262142
              out8[1].b02 = in8[2].b05;
    
      262142
              out8[1].b03 = in8[2].b04;
    
      262142
              out8[1].b04 = in8[2].b03;
    
      262142
              out8[1].b05 = in8[2].b02;
    
      262142
              out8[1].b06 = in8[2].b01;
    
      262142
              out8[1].b07 = in8[2].b00;
    
      262142
              out8[0].b00 = in8[3].b07;
    
      262142
              out8[0].b01 = in8[3].b06;
    
      262142
              out8[0].b02 = in8[3].b05;
    
      262142
              out8[0].b03 = in8[3].b04;
    
      262142
              out8[0].b04 = in8[3].b03;
    
      262142
              out8[0].b05 = in8[3].b02;
    
      262142
              out8[0].b06 = in8[3].b01;
    
      262142
              out8[0].b07 = in8[3].b00;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
    
      // http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
    
      {
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
    
      262142
                         (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
    
      262142
                         (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
    
      262142
                         (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      // Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
    
      // domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
    
      {
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
          const uint8_t* in8;
    
          uint8_t* out8;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              in8 = (const uint8_t*)in_ptr;
    
      262142
              out8 = (uint8_t*)out_ptr;
    
      262142
              out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
    
      262142
              out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
    
      262142
              out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
    
      262142
              out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_GENERIC
    
      // Current gr-pager implementation
    
      static inline void
    
      2
      volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
    
      {
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
          const uint8_t* in8;
    
          uint8_t* out8;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              in8 = (const uint8_t*)in_ptr;
    
      262142
              out8 = (uint8_t*)out_ptr;
    
      262142
              out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    
      262142
              out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    
      262142
              out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    
      262142
              out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      // After lengthy thought and quite a bit of whiteboarding:
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
    
                                                                       const uint32_t* in,
    
                                                                       unsigned int num_points)
    
      {
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              uint32_t tmp = *in_ptr;
    
              /* permute uint16:
    
                 The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
    
               */
    
      262142
              tmp = (tmp << 16) | (tmp >> 16);
    
              /* permute bytes:
    
                 shift up by 1 B first, then only consider even bytes, and OR with the unshifted
    
                 even bytes
    
               */
    
      262142
              tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
    
              /* permute 4bit tuples:
    
                 Same idea, but the "consideration" mask expression becomes unwieldy
    
               */
    
      262142
              tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
    
      262142
                    ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
    
              /* permute 2bit tuples:
    
                 Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
    
                 3; we need those every 4b, which coincides with a hex digit!
    
              */
    
      262142
              tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
    
              /* permute odd/even:
    
                 0x01 = 0x1;  we need these every 2b, which works out: 0x01 | (0x01 << 2) =
    
                 0x05!
    
               */
    
      262142
              tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
    
      262142
              *out_ptr = tmp;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
    
                                                                        const uint32_t* in,
    
                                                                        unsigned int num_points)
    
      {
    
          // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
    
      2
          const uint32_t* in_ptr = in;
    
      2
          uint32_t* out_ptr = out;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; number < num_points; ++number) {
    
      262142
              uint32_t tmp = *in_ptr;
    
      262142
              tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
    
      262142
              tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
    
      262142
              tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
    
      262142
                    ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
    
      262142
              tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
    
      262142
              tmp = (tmp << 16) | (tmp >> 16);
    
      262142
              *out_ptr = tmp;
    
      262142
              ++in_ptr;
    
      262142
              ++out_ptr;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEONV8
    
      #include <arm_neon.h>
    
      static inline void
    
      volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
    
      {
    
          const uint32_t* in_ptr = in;
    
          uint32_t* out_ptr = out;
    
          const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
    
          const unsigned int quarterPoints = num_points / 4;
    
          unsigned int number = 0;
    
          for (; number < quarterPoints; ++number) {
    
              __VOLK_PREFETCH(in_ptr + 4);
    
              uint32x4_t x = vld1q_u32(in_ptr);
    
              uint32x4_t z =
    
                  vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
    
              vst1q_u32(out_ptr, z);
    
              in_ptr += 4;
    
              out_ptr += 4;
    
          }
    
          number = quarterPoints * 4;
    
          for (; number < num_points; ++number) {
    
              *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
    
                         (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
    
                         (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
    
                         (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
    
              ++in_ptr;
    
              ++out_ptr;
    
          }
    
      }
    
      #endif /* LV_HAVE_NEONV8 */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #if defined(__aarch64__)
    
      #define DO_RBIT                             \
    
          __VOLK_ASM("rbit %w[result], %w[value]" \
    
                     : [result] "=r"(*out_ptr)    \
    
                     : [value] "r"(*in_ptr)       \
    
                     :);                          \
    
          in_ptr++;                               \
    
          out_ptr++;
    
      #else
    
      #define DO_RBIT                           \
    
          __VOLK_ASM("rbit %[result], %[value]" \
    
                     : [result] "=r"(*out_ptr)  \
    
                     : [value] "r"(*in_ptr)     \
    
                     :);                        \
    
          in_ptr++;                             \
    
          out_ptr++;
    
      #endif
    
      static inline void
    
      volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
    
      {
    
          const uint32_t* in_ptr = in;
    
          uint32_t* out_ptr = out;
    
          const unsigned int eighthPoints = num_points / 8;
    
          unsigned int number = 0;
    
          for (; number < eighthPoints; ++number) {
    
              __VOLK_PREFETCH(in_ptr + 8);
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
              DO_RBIT;
    
          }
    
          number = eighthPoints * 8;
    
          for (; number < num_points; ++number) {
    
              DO_RBIT;
    
          }
    
      }
    
      #undef DO_RBIT
    
      #endif /* LV_HAVE_NEON */
    
      #endif /* INCLUDED_volk_32u_reverse_32u_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2018 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32u_reverse_32u
12			*
13			* \b bit reversal of the input 32 bit word
14
15			* <b>Dispatcher Prototype</b>
16			* \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int
17			num_points);
18			* \endcode
19			*
20			* \b Inputs
21			* \li inputVector: The input vector
22			* \li num_points The number of data points.
23			*
24			* \b Outputs
25			* \li outputVector: The vector where the results will be stored, which is the
26			bit-reversed input
27			*
28			* \endcode
29			*/
30			#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
31			struct dword_split {
32			int b00 : 1;
33			int b01 : 1;
34			int b02 : 1;
35			int b03 : 1;
36			int b04 : 1;
37			int b05 : 1;
38			int b06 : 1;
39			int b07 : 1;
40			int b08 : 1;
41			int b09 : 1;
42			int b10 : 1;
43			int b11 : 1;
44			int b12 : 1;
45			int b13 : 1;
46			int b14 : 1;
47			int b15 : 1;
48			int b16 : 1;
49			int b17 : 1;
50			int b18 : 1;
51			int b19 : 1;
52			int b20 : 1;
53			int b21 : 1;
54			int b22 : 1;
55			int b23 : 1;
56			int b24 : 1;
57			int b25 : 1;
58			int b26 : 1;
59			int b27 : 1;
60			int b28 : 1;
61			int b29 : 1;
62			int b30 : 1;
63			int b31 : 1;
64			};
65			struct char_split {
66			uint8_t b00 : 1;
67			uint8_t b01 : 1;
68			uint8_t b02 : 1;
69			uint8_t b03 : 1;
70			uint8_t b04 : 1;
71			uint8_t b05 : 1;
72			uint8_t b06 : 1;
73			uint8_t b07 : 1;
74			};
75
76			// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
77			// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
78			static const unsigned char BitReverseTable256[] = {
79			0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
80			0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
81			0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
82			0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
83			0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
84			0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
85			0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
86			0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
87			0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
88			0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
89			0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
90			0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
91			0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
92			0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
93			0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
94			0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
95			0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
96			0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
97			0x3F, 0xBF, 0x7F, 0xFF
98			};
99			#ifdef LV_HAVE_GENERIC
100		2	static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
101			const uint32_t* in,
102			unsigned int num_points)
103			{
104		2	const struct dword_split* in_ptr = (const struct dword_split*)in;
105		2	struct dword_split* out_ptr = (struct dword_split*)out;
106		2	unsigned int number = 0;
107	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; ++number) {
108		262142	out_ptr->b00 = in_ptr->b31;
109		262142	out_ptr->b01 = in_ptr->b30;
110		262142	out_ptr->b02 = in_ptr->b29;
111		262142	out_ptr->b03 = in_ptr->b28;
112		262142	out_ptr->b04 = in_ptr->b27;
113		262142	out_ptr->b05 = in_ptr->b26;
114		262142	out_ptr->b06 = in_ptr->b25;
115		262142	out_ptr->b07 = in_ptr->b24;
116		262142	out_ptr->b08 = in_ptr->b23;
117		262142	out_ptr->b09 = in_ptr->b22;
118		262142	out_ptr->b10 = in_ptr->b21;
119		262142	out_ptr->b11 = in_ptr->b20;
120		262142	out_ptr->b12 = in_ptr->b19;
121		262142	out_ptr->b13 = in_ptr->b18;
122		262142	out_ptr->b14 = in_ptr->b17;
123		262142	out_ptr->b15 = in_ptr->b16;
124		262142	out_ptr->b16 = in_ptr->b15;
125		262142	out_ptr->b17 = in_ptr->b14;
126		262142	out_ptr->b18 = in_ptr->b13;
127		262142	out_ptr->b19 = in_ptr->b12;
128		262142	out_ptr->b20 = in_ptr->b11;
129		262142	out_ptr->b21 = in_ptr->b10;
130		262142	out_ptr->b22 = in_ptr->b09;
131		262142	out_ptr->b23 = in_ptr->b08;
132		262142	out_ptr->b24 = in_ptr->b07;
133		262142	out_ptr->b25 = in_ptr->b06;
134		262142	out_ptr->b26 = in_ptr->b05;
135		262142	out_ptr->b27 = in_ptr->b04;
136		262142	out_ptr->b28 = in_ptr->b03;
137		262142	out_ptr->b29 = in_ptr->b02;
138		262142	out_ptr->b30 = in_ptr->b01;
139		262142	out_ptr->b31 = in_ptr->b00;
140		262142	++in_ptr;
141		262142	++out_ptr;
142			}
143		2	}
144			#endif /* LV_HAVE_GENERIC */
145
146			#ifdef LV_HAVE_GENERIC
147		2	static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
148			const uint32_t* in,
149			unsigned int num_points)
150			{
151		2	const uint32_t* in_ptr = in;
152		2	uint32_t* out_ptr = out;
153		2	unsigned int number = 0;
154	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; ++number) {
155		262142	const struct char_split* in8 = (const struct char_split*)in_ptr;
156		262142	struct char_split* out8 = (struct char_split*)out_ptr;
157
158		262142	out8[3].b00 = in8[0].b07;
159		262142	out8[3].b01 = in8[0].b06;
160		262142	out8[3].b02 = in8[0].b05;
161		262142	out8[3].b03 = in8[0].b04;
162		262142	out8[3].b04 = in8[0].b03;
163		262142	out8[3].b05 = in8[0].b02;
164		262142	out8[3].b06 = in8[0].b01;
165		262142	out8[3].b07 = in8[0].b00;
166
167		262142	out8[2].b00 = in8[1].b07;
168		262142	out8[2].b01 = in8[1].b06;
169		262142	out8[2].b02 = in8[1].b05;
170		262142	out8[2].b03 = in8[1].b04;
171		262142	out8[2].b04 = in8[1].b03;
172		262142	out8[2].b05 = in8[1].b02;
173		262142	out8[2].b06 = in8[1].b01;
174		262142	out8[2].b07 = in8[1].b00;
175
176		262142	out8[1].b00 = in8[2].b07;
177		262142	out8[1].b01 = in8[2].b06;
178		262142	out8[1].b02 = in8[2].b05;
179		262142	out8[1].b03 = in8[2].b04;
180		262142	out8[1].b04 = in8[2].b03;
181		262142	out8[1].b05 = in8[2].b02;
182		262142	out8[1].b06 = in8[2].b01;
183		262142	out8[1].b07 = in8[2].b00;
184
185		262142	out8[0].b00 = in8[3].b07;
186		262142	out8[0].b01 = in8[3].b06;
187		262142	out8[0].b02 = in8[3].b05;
188		262142	out8[0].b03 = in8[3].b04;
189		262142	out8[0].b04 = in8[3].b03;
190		262142	out8[0].b05 = in8[3].b02;
191		262142	out8[0].b06 = in8[3].b01;
192		262142	out8[0].b07 = in8[3].b00;
193		262142	++in_ptr;
194		262142	++out_ptr;
195			}
196		2	}
197			#endif /* LV_HAVE_GENERIC */
198
199			// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
200			// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
201			#ifdef LV_HAVE_GENERIC
202			static inline void
203		2	volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
204			{
205		2	const uint32_t* in_ptr = in;
206		2	uint32_t* out_ptr = out;
207		2	unsigned int number = 0;
208	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; ++number) {
209		262142	*out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) \|
210		262142	(BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) \|
211		262142	(BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) \|
212		262142	(BitReverseTable256[(*in_ptr >> 24) & 0xff]);
213		262142	++in_ptr;
214		262142	++out_ptr;
215			}
216		2	}
217			#endif /* LV_HAVE_GENERIC */
218
219			// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
220			// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
221			#ifdef LV_HAVE_GENERIC
222			static inline void
223		2	volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
224			{
225		2	const uint32_t* in_ptr = in;
226		2	uint32_t* out_ptr = out;
227			const uint8_t* in8;
228			uint8_t* out8;
229		2	unsigned int number = 0;
230	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; ++number) {
231		262142	in8 = (const uint8_t*)in_ptr;
232		262142	out8 = (uint8_t*)out_ptr;
233		262142	out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
234		262142	out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
235		262142	out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
236		262142	out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
237		262142	++in_ptr;
238		262142	++out_ptr;
239			}
240		2	}
241			#endif /* LV_HAVE_GENERIC */
242
243			#ifdef LV_HAVE_GENERIC
244			// Current gr-pager implementation
245			static inline void
246		2	volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
247			{
248		2	const uint32_t* in_ptr = in;
249		2	uint32_t* out_ptr = out;
250			const uint8_t* in8;
251			uint8_t* out8;
252		2	unsigned int number = 0;
253	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; number < num_points; ++number) {
254		262142	in8 = (const uint8_t*)in_ptr;
255		262142	out8 = (uint8_t*)out_ptr;
256		262142	out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
257		262142	out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
258		262142	out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
259		262142	out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
260		262142	++in_ptr;
261		262142	++out_ptr;
262			}
263		2	}
264			#endif /* LV_HAVE_GENERIC */
265
266			// After lengthy thought and quite a bit of whiteboarding:
267			#ifdef LV_HAVE_GENERIC
/*!
 * \brief Bit-reverses every 32-bit word of \p in into \p out by swapping
 * progressively smaller halves: 16-bit halves, bytes, nibbles, bit pairs, bits.
 *
 * Each stage shifts the "lower" elements up and the "upper" elements down and
 * ORs the two; the mask selects the lower element of every pair.
 *
 * \param out        destination buffer, at least \p num_points words
 * \param in         source buffer, at least \p num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
                                                                 const uint32_t* in,
                                                                 unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        uint32_t tmp = *in_ptr++;

        /* swap the two 16-bit halves: lower half up, upper half down */
        tmp = (tmp << 16) | (tmp >> 16);
        /* swap bytes within each half: 0x00FF00FF == 0xFF | 0xFF << 16 */
        tmp = ((tmp & 0x00FF00FFu) << 8) | ((tmp >> 8) & 0x00FF00FFu);
        /* swap 4-bit tuples within each byte:
           0x0F0F0F0F == 0xF | 0xF << 8 | 0xF << 16 | 0xF << 24 */
        tmp = ((tmp & 0x0F0F0F0Fu) << 4) | ((tmp >> 4) & 0x0F0F0F0Fu);
        /* swap 2-bit tuples within each nibble: 0b0011 = 3, once per hex digit */
        tmp = ((tmp & 0x33333333u) << 2) | ((tmp >> 2) & 0x33333333u);
        /* swap odd/even bits: 0b0101 = 5, once per hex digit */
        tmp = ((tmp & 0x55555555u) << 1) | ((tmp >> 1) & 0x55555555u);

        *out_ptr++ = tmp;
    }
}
307			#endif /* LV_HAVE_GENERIC */
308			#ifdef LV_HAVE_GENERIC
/*!
 * \brief Bit-reverses every 32-bit word of \p in into \p out by swapping
 * progressively larger groups: bits, bit pairs, nibbles, bytes, 16-bit halves.
 *
 * These are the same five swap stages as the top-down variant, applied in the
 * opposite order; the stages commute, so the result is the same bit reversal.
 *
 * \param out        destination buffer, at least \p num_points words
 * \param in         source buffer, at least \p num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
                                                                  const uint32_t* in,
                                                                  unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        uint32_t tmp = *in_ptr++;

        /* swap odd/even bits */
        tmp = ((tmp & 0x55555555u) << 1) | ((tmp >> 1) & 0x55555555u);
        /* swap 2-bit tuples */
        tmp = ((tmp & 0x33333333u) << 2) | ((tmp >> 2) & 0x33333333u);
        /* swap 4-bit tuples: 0x0F0F0F0F == 0xF | 0xF << 8 | 0xF << 16 | 0xF << 24 */
        tmp = ((tmp & 0x0F0F0F0Fu) << 4) | ((tmp >> 4) & 0x0F0F0F0Fu);
        /* swap bytes: 0x00FF00FF == 0xFF | 0xFF << 16 */
        tmp = ((tmp & 0x00FF00FFu) << 8) | ((tmp >> 8) & 0x00FF00FFu);
        /* swap the two 16-bit halves */
        tmp = (tmp << 16) | (tmp >> 16);

        *out_ptr++ = tmp;
    }
}
331			#endif /* LV_HAVE_GENERIC */
332
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*!
 * \brief Bit-reverses every 32-bit word of \p in into \p out, four words per
 * iteration with NEON, followed by a scalar loop for the remaining 0..3 words.
 *
 * Vector path: vrbitq_u8 reverses the bits inside each byte, then vqtbl1q_u8
 * with \c idx mirrors the byte order inside each 32-bit lane.
 *
 * \param out        destination buffer, at least \p num_points words
 * \param in         source buffer, at least \p num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;

    /* Byte shuffle indices: reverse the four bytes of each 32-bit lane. */
    const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; ++number) {
        __VOLK_PREFETCH(in_ptr + 4);
        uint32x4_t x = vld1q_u32(in_ptr);
        uint32x4_t z =
            vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
        vst1q_u32(out_ptr, z);
        in_ptr += 4;
        out_ptr += 4;
    }

    /* Scalar tail. BitReverseTable256 is declared outside this block; presumably
     * the 256-entry reversed-byte lookup table -- confirm against its definition.
     * Both pointers must be dereferenced here: the table is indexed by the bytes
     * of the input word, and the result is stored through out_ptr. */
    number = quarterPoints * 4;
    for (; number < num_points; ++number) {
        *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
                   (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
                   (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
                   (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
        ++in_ptr;
        ++out_ptr;
    }
}

#endif /* LV_HAVE_NEONV8 */
367
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * DO_RBIT: bit-reverse the word at *in_ptr into *out_ptr with the RBIT
 * instruction, then advance both pointers by one word.
 *
 * Wrapped in do { } while (0) so that the three statements always act as a
 * single statement at the call site (e.g. under an unbraced if/for).
 * On AArch64 the %w operand modifier selects the 32-bit W view of the register.
 */
#if defined(__aarch64__)
#define DO_RBIT                                 \
    do {                                        \
        __VOLK_ASM("rbit %w[result], %w[value]" \
                   : [result] "=r"(*out_ptr)    \
                   : [value] "r"(*in_ptr)       \
                   :);                          \
        in_ptr++;                               \
        out_ptr++;                              \
    } while (0)
#else
#define DO_RBIT                               \
    do {                                      \
        __VOLK_ASM("rbit %[result], %[value]" \
                   : [result] "=r"(*out_ptr)  \
                   : [value] "r"(*in_ptr)     \
                   :);                        \
        in_ptr++;                             \
        out_ptr++;                            \
    } while (0)
#endif

/*!
 * \brief Bit-reverses every 32-bit word of \p in into \p out using the ARM
 * RBIT instruction, unrolled eight words per iteration, followed by a
 * one-word-at-a-time loop for the remaining 0..7 words.
 *
 * \param out        destination buffer, at least \p num_points words
 * \param in         source buffer, at least \p num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    const unsigned int eighthPoints = num_points / 8;
    unsigned int number = 0;

    for (; number < eighthPoints; ++number) {
        __VOLK_PREFETCH(in_ptr + 8);
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
    }

    /* Leftover words that did not fill a full group of eight. */
    number = eighthPoints * 8;
    for (; number < num_points; ++number) {
        DO_RBIT;
    }
}
#undef DO_RBIT
#endif /* LV_HAVE_NEON */
415
416
417			#endif /* INCLUDED_volk_32u_reverse_32u_u_H */
418