GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_index_max_16u.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	183	191	95.8%
Functions:	6	6	100.0%
Branches:	50	62	80.6%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_index_max_16u
    
       *
    
       * \b Overview
    
       *
    
       * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
    
       * maximum magnitude for complex points in the given vector.
    
       *
    
       * Note that num_points is a uint32_t, but the return value is
    
       * uint16_t. Providing a vector larger than the max of a uint16_t
    
       * (65536) would miss anything outside of this boundary. The kernel
    
       * will check the length of num_points and cap it to this max value,
    
       * anyways.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li src0: The complex input vector.
    
       * \li num_points: The number of samples.
    
       *
    
       * \b Outputs
    
       * \li target: The index of the point with maximum magnitude.
    
       *
    
       * \b Example
    
       * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
    
       * the unit circle.
    
       * \code
    
       *   int N = 10;
    
       *   uint32_t alignment = volk_get_alignment();
    
       *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
    
       *
    
       *   for(uint32_t ii = 0; ii < N/2; ++ii){
    
       *       float real = 2.f * ((float)ii / (float)N) - 1.f;
    
       *       float imag = std::sqrt(1.f - real * real);
    
       *       in[ii] = lv_cmake(real, imag);
    
       *       in[ii] = in[ii] * in[ii] + in[ii];
    
       *       in[N-ii] = lv_cmake(real, imag);
    
       *       in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
    
       *   }
    
       *
    
       *   volk_32fc_index_max_16u(max, in, N);
    
       *
    
       *   printf("index of max value = %u\n",  *max);
    
       *
    
       *   volk_free(in);
    
       *   volk_free(max);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
    
      #define INCLUDED_volk_32fc_index_max_16u_a_H
    
      #include <inttypes.h>
    
      #include <limits.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_load_ps((float*)src0);
    
      16382
              __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
    
      16382
              vector_32fc_index_max_variant0(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      16382
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.

      16
              if (max_values_buffer[i] > max) {
    
      8
                  max = max_values_buffer[i];
    
      8
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_load_ps((float*)src0);
    
      16382
              __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
    
      16382
              vector_32fc_index_max_variant1(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      16382
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.

      16
              if (max_values_buffer[i] > max) {
    
      8
                  max = max_values_buffer[i];
    
      8
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const uint32_t num_bytes = num_points * 8;
    
          union bit128 holderf;
    
          union bit128 holderi;
    
      2
          float sq_dist = 0.0;
    
          union bit128 xmm5, xmm4;
    
          __m128 xmm1, xmm2, xmm3;
    
          __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    
      2
          xmm5.int_vec = _mm_setzero_si128();
    
      2
          xmm4.int_vec = _mm_setzero_si128();
    
      2
          holderf.int_vec = _mm_setzero_si128();
    
      2
          holderi.int_vec = _mm_setzero_si128();
    
      2
          int bound = num_bytes >> 5;
    
      2
          int i = 0;
    
      2
          xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    
      2
          xmm9 = _mm_setzero_si128();
    
      2
          xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    
      2
          xmm3 = _mm_setzero_ps();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; i < bound; ++i) {
    
      32766
              xmm1 = _mm_load_ps((float*)src0);
    
      32766
              xmm2 = _mm_load_ps((float*)&src0[2]);
    
      32766
              src0 += 4;
    
      65532
              xmm1 = _mm_mul_ps(xmm1, xmm1);
    
      32766
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      32766
              xmm1 = _mm_hadd_ps(xmm1, xmm2);
    
      32766
              xmm3 = _mm_max_ps(xmm1, xmm3);
    
      32766
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      32766
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      32766
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      65532
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      32766
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      65532
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_bytes >> 4 & 1) {
    
      2
              xmm2 = _mm_load_ps((float*)src0);
    
      2
              xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    
      2
              xmm8 = bit128_p(&xmm1)->int_vec;
    
      2
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      2
              src0 += 2;
    
      2
              xmm1 = _mm_hadd_ps(xmm2, xmm2);
    
      4
              xmm3 = _mm_max_ps(xmm1, xmm3);
    
      2
              xmm10 = _mm_setr_epi32(2, 2, 2, 2);
    
      2
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      4
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_bytes >> 3 & 1) {
    
      2
              sq_dist =
    
      2
                  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
    
      2
              xmm2 = _mm_load1_ps(&sq_dist);
    
      2
              xmm1 = xmm3;
    
      2
              xmm3 = _mm_max_ss(xmm3, xmm2);
    
      2
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
          }
    
          _mm_store_ps((float*)&(holderf.f), xmm3);
    
          _mm_store_si128(&(holderi.int_vec), xmm9);
    
      2
          target[0] = holderi.i[0];
    
      2
          sq_dist = holderf.f[0];
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
    
      2
      }
    
      #endif /*LV_HAVE_SSE3*/
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const uint32_t num_bytes = num_points * 8;
    
      2
          float sq_dist = 0.0;
    
      2
          float max = 0.0;
    
      2
          uint16_t index = 0;
    
      2
          uint32_t i = 0;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (; i<num_bytes>> 3; ++i) {
    
      131070
              sq_dist =
    
      131070
                  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 131042 times.

      131070
              if (sq_dist > max) {
    
      28
                  index = i;
    
      28
                  max = sq_dist;
    
              }
    
          }
    
      2
          target[0] = index;
    
      2
      }
    
      #endif /*LV_HAVE_GENERIC*/
    
      #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
    
      #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
    
      #define INCLUDED_volk_32fc_index_max_16u_u_H
    
      #include <inttypes.h>
    
      #include <limits.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_loadu_ps((float*)src0);
    
      16382
              __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
    
      16382
              vector_32fc_index_max_variant0(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      16382
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.

      16
              if (max_values_buffer[i] > max) {
    
      8
                  max = max_values_buffer[i];
    
      8
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_loadu_ps((float*)src0);
    
      16382
              __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
    
      16382
              vector_32fc_index_max_variant1(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      16382
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.

      16
              if (max_values_buffer[i] > max) {
    
      8
                  max = max_values_buffer[i];
    
      8
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/

Line	Branch	Exec	Source
1			/* -- c++ -- */
2			/*
3			* Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_index_max_16u
12			*
13			* \b Overview
14			*
15			* Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
16			* maximum magnitude for complex points in the given vector.
17			*
18			* Note that num_points is a uint32_t, but the return value is
19			* uint16_t. Providing a vector larger than the max of a uint16_t
20			* (65536) would miss anything outside of this boundary. The kernel
21			* will check the length of num_points and cap it to this max value,
22			* anyways.
23			*
24			* <b>Dispatcher Prototype</b>
25			* \code
26			* void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
27			* num_points) \endcode
28			*
29			* \b Inputs
30			* \li src0: The complex input vector.
31			* \li num_points: The number of samples.
32			*
33			* \b Outputs
34			* \li target: The index of the point with maximum magnitude.
35			*
36			* \b Example
37			* Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
38			* the unit circle.
39			* \code
40			* int N = 10;
41			* uint32_t alignment = volk_get_alignment();
42			* lv_32fc_t* in = (lv_32fc_t)volk_malloc(sizeof(lv_32fc_t)N, alignment);
43			* uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
44			*
45			* for(uint32_t ii = 0; ii < N/2; ++ii){
46			* float real = 2.f * ((float)ii / (float)N) - 1.f;
47			* float imag = std::sqrt(1.f - real * real);
48			* in[ii] = lv_cmake(real, imag);
49			* in[ii] = in[ii] * in[ii] + in[ii];
50			* in[N-ii] = lv_cmake(real, imag);
51			* in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
52			* }
53			*
54			* volk_32fc_index_max_16u(max, in, N);
55			*
56			* printf("index of max value = %u\n", *max);
57			*
58			* volk_free(in);
59			* volk_free(max);
60			* \endcode
61			*/
62
63			#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
64			#define INCLUDED_volk_32fc_index_max_16u_a_H
65
66			#include <inttypes.h>
67			#include <limits.h>
68			#include <stdio.h>
69			#include <volk/volk_common.h>
70			#include <volk/volk_complex.h>
71
72			#ifdef LV_HAVE_AVX2
73			#include <immintrin.h>
74			#include <volk/volk_avx2_intrinsics.h>
75
76		2	static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
77			lv_32fc_t* src0,
78			uint32_t num_points)
79			{
80		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81
82		2	const __m256i indices_increment = _mm256_set1_epi32(8);
83			/*
84			* At the start of each loop iteration current_indices holds the indices of
85			* the complex numbers loaded from memory. Explanation for odd order is given
86			* in implementation of vector_32fc_index_max_variant0().
87			*/
88		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89
90		2	__m256 max_values = _mm256_setzero_ps();
91		2	__m256i max_indices = _mm256_setzero_si256();
92
93	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
94		16382	__m256 in0 = _mm256_load_ps((float*)src0);
95		16382	__m256 in1 = _mm256_load_ps((float*)(src0 + 4));
96		16382	vector_32fc_index_max_variant0(
97			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
98		16382	src0 += 8;
99			}
100
101			// determine maximum value and index in the result of the vectorized loop
102			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
103			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
104		2	_mm256_store_ps(max_values_buffer, max_values);
105		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
106
107		2	float max = 0.f;
108		2	uint32_t index = 0;
109	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
110	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 8 times.	16	if (max_values_buffer[i] > max) {
111		8	max = max_values_buffer[i];
112		8	index = max_indices_buffer[i];
113			}
114			}
115
116			// handle tail not processed by the vectorized loop
117	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118		14	const float abs_squared =
119		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
120	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
121		✗	max = abs_squared;
122		✗	index = i;
123			}
124		14	++src0;
125			}
126
127		2	*target = index;
128		2	}
129
130			#endif /LV_HAVE_AVX2/
131
132			#ifdef LV_HAVE_AVX2
133			#include <immintrin.h>
134			#include <volk/volk_avx2_intrinsics.h>
135
136		2	static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
137			lv_32fc_t* src0,
138			uint32_t num_points)
139			{
140		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141
142		2	const __m256i indices_increment = _mm256_set1_epi32(8);
143			/*
144			* At the start of each loop iteration current_indices holds the indices of
145			* the complex numbers loaded from memory. Explanation for odd order is given
146			* in implementation of vector_32fc_index_max_variant0().
147			*/
148		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149
150		2	__m256 max_values = _mm256_setzero_ps();
151		2	__m256i max_indices = _mm256_setzero_si256();
152
153	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
154		16382	__m256 in0 = _mm256_load_ps((float*)src0);
155		16382	__m256 in1 = _mm256_load_ps((float*)(src0 + 4));
156		16382	vector_32fc_index_max_variant1(
157			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
158		16382	src0 += 8;
159			}
160
161			// determine maximum value and index in the result of the vectorized loop
162			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
163			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
164		2	_mm256_store_ps(max_values_buffer, max_values);
165		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
166
167		2	float max = 0.f;
168		2	uint32_t index = 0;
169	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
170	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 8 times.	16	if (max_values_buffer[i] > max) {
171		8	max = max_values_buffer[i];
172		8	index = max_indices_buffer[i];
173			}
174			}
175
176			// handle tail not processed by the vectorized loop
177	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178		14	const float abs_squared =
179		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
180	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
181		✗	max = abs_squared;
182		✗	index = i;
183			}
184		14	++src0;
185			}
186
187		2	*target = index;
188		2	}
189
190			#endif /LV_HAVE_AVX2/
191
192			#ifdef LV_HAVE_SSE3
193			#include <pmmintrin.h>
194			#include <xmmintrin.h>
195
196			static inline void
197		2	volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
198			{
199		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
200		2	const uint32_t num_bytes = num_points * 8;
201
202			union bit128 holderf;
203			union bit128 holderi;
204		2	float sq_dist = 0.0;
205
206			union bit128 xmm5, xmm4;
207			__m128 xmm1, xmm2, xmm3;
208			__m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209
210		2	xmm5.int_vec = _mm_setzero_si128();
211		2	xmm4.int_vec = _mm_setzero_si128();
212		2	holderf.int_vec = _mm_setzero_si128();
213		2	holderi.int_vec = _mm_setzero_si128();
214
215		2	int bound = num_bytes >> 5;
216		2	int i = 0;
217
218		2	xmm8 = _mm_setr_epi32(0, 1, 2, 3);
219		2	xmm9 = _mm_setzero_si128();
220		2	xmm10 = _mm_setr_epi32(4, 4, 4, 4);
221		2	xmm3 = _mm_setzero_ps();
222
223	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; i < bound; ++i) {
224		32766	xmm1 = _mm_load_ps((float*)src0);
225		32766	xmm2 = _mm_load_ps((float*)&src0[2]);
226
227		32766	src0 += 4;
228
229		65532	xmm1 = _mm_mul_ps(xmm1, xmm1);
230		32766	xmm2 = _mm_mul_ps(xmm2, xmm2);
231
232		32766	xmm1 = _mm_hadd_ps(xmm1, xmm2);
233
234		32766	xmm3 = _mm_max_ps(xmm1, xmm3);
235
236		32766	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
237		32766	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
238
239		32766	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
240		65532	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
241
242		32766	xmm9 = _mm_add_epi32(xmm11, xmm12);
243
244		65532	xmm8 = _mm_add_epi32(xmm8, xmm10);
245			}
246
247	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_bytes >> 4 & 1) {
248		2	xmm2 = _mm_load_ps((float*)src0);
249
250		2	xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
251		2	xmm8 = bit128_p(&xmm1)->int_vec;
252
253		2	xmm2 = _mm_mul_ps(xmm2, xmm2);
254
255		2	src0 += 2;
256
257		2	xmm1 = _mm_hadd_ps(xmm2, xmm2);
258
259		4	xmm3 = _mm_max_ps(xmm1, xmm3);
260
261		2	xmm10 = _mm_setr_epi32(2, 2, 2, 2);
262
263		2	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
264		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
265
266		2	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
267		4	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
268
269		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
270
271		4	xmm8 = _mm_add_epi32(xmm8, xmm10);
272			}
273
274	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_bytes >> 3 & 1) {
275		2	sq_dist =
276		2	lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
277
278		2	xmm2 = _mm_load1_ps(&sq_dist);
279
280		2	xmm1 = xmm3;
281
282		2	xmm3 = _mm_max_ss(xmm3, xmm2);
283
284		2	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
285		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
286
287		2	xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
288
289		2	xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
290		4	xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
291
292		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
293			}
294
295			_mm_store_ps((float*)&(holderf.f), xmm3);
296			_mm_store_si128(&(holderi.int_vec), xmm9);
297
298		2	target[0] = holderi.i[0];
299		2	sq_dist = holderf.f[0];
300	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
301	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
302	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
303	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
304	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
305	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
306		2	}
307
308			#endif /LV_HAVE_SSE3/
309
310			#ifdef LV_HAVE_GENERIC
311			static inline void
312		2	volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
313			{
314		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316		2	const uint32_t num_bytes = num_points * 8;
317
318		2	float sq_dist = 0.0;
319		2	float max = 0.0;
320		2	uint16_t index = 0;
321
322		2	uint32_t i = 0;
323
324	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (; i<num_bytes>> 3; ++i) {
325		131070	sq_dist =
326		131070	lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
327
328	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 131042 times.	131070	if (sq_dist > max) {
329		28	index = i;
330		28	max = sq_dist;
331			}
332			}
333		2	target[0] = index;
334		2	}
335
336			#endif /LV_HAVE_GENERIC/
337
338			#endif /INCLUDED_volk_32fc_index_max_16u_a_H/
339
340			#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
341			#define INCLUDED_volk_32fc_index_max_16u_u_H
342
343			#include <inttypes.h>
344			#include <limits.h>
345			#include <stdio.h>
346			#include <volk/volk_common.h>
347			#include <volk/volk_complex.h>
348
349			#ifdef LV_HAVE_AVX2
350			#include <immintrin.h>
351			#include <volk/volk_avx2_intrinsics.h>
352
353		2	static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
354			lv_32fc_t* src0,
355			uint32_t num_points)
356			{
357		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
358
359		2	const __m256i indices_increment = _mm256_set1_epi32(8);
360			/*
361			* At the start of each loop iteration current_indices holds the indices of
362			* the complex numbers loaded from memory. Explanation for odd order is given
363			* in implementation of vector_32fc_index_max_variant0().
364			*/
365		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
366
367		2	__m256 max_values = _mm256_setzero_ps();
368		2	__m256i max_indices = _mm256_setzero_si256();
369
370	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
371		16382	__m256 in0 = _mm256_loadu_ps((float*)src0);
372		16382	__m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
373		16382	vector_32fc_index_max_variant0(
374			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
375		16382	src0 += 8;
376			}
377
378			// determine maximum value and index in the result of the vectorized loop
379			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
380			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
381		2	_mm256_store_ps(max_values_buffer, max_values);
382		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
383
384		2	float max = 0.f;
385		2	uint32_t index = 0;
386	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
387	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 8 times.	16	if (max_values_buffer[i] > max) {
388		8	max = max_values_buffer[i];
389		8	index = max_indices_buffer[i];
390			}
391			}
392
393			// handle tail not processed by the vectorized loop
394	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
395		14	const float abs_squared =
396		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
397	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
398		✗	max = abs_squared;
399		✗	index = i;
400			}
401		14	++src0;
402			}
403
404		2	*target = index;
405		2	}
406
407			#endif /LV_HAVE_AVX2/
408
409			#ifdef LV_HAVE_AVX2
410			#include <immintrin.h>
411			#include <volk/volk_avx2_intrinsics.h>
412
413		2	static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
414			lv_32fc_t* src0,
415			uint32_t num_points)
416			{
417		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
418
419		2	const __m256i indices_increment = _mm256_set1_epi32(8);
420			/*
421			* At the start of each loop iteration current_indices holds the indices of
422			* the complex numbers loaded from memory. Explanation for odd order is given
423			* in implementation of vector_32fc_index_max_variant0().
424			*/
425		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
426
427		2	__m256 max_values = _mm256_setzero_ps();
428		2	__m256i max_indices = _mm256_setzero_si256();
429
430	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
431		16382	__m256 in0 = _mm256_loadu_ps((float*)src0);
432		16382	__m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
433		16382	vector_32fc_index_max_variant1(
434			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
435		16382	src0 += 8;
436			}
437
438			// determine maximum value and index in the result of the vectorized loop
439			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
440			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
441		2	_mm256_store_ps(max_values_buffer, max_values);
442		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
443
444		2	float max = 0.f;
445		2	uint32_t index = 0;
446	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
447	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 8 times.	16	if (max_values_buffer[i] > max) {
448		8	max = max_values_buffer[i];
449		8	index = max_indices_buffer[i];
450			}
451			}
452
453			// handle tail not processed by the vectorized loop
454	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
455		14	const float abs_squared =
456		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
457	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
458		✗	max = abs_squared;
459		✗	index = i;
460			}
461		14	++src0;
462			}
463
464		2	*target = index;
465		2	}
466
467			#endif /LV_HAVE_AVX2/
468
469			#endif /INCLUDED_volk_32fc_index_max_16u_u_H/
470