GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_index_min_16u.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	179	187	95.7%
Functions:	6	6	100.0%
Branches:	54	62	87.1%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2021 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_index_min_16u
    
       *
    
       * \b Overview
    
       *
    
       * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
    
       * minimum magnitude for complex points in the given vector.
    
       *
    
       * Note that num_points is a uint32_t, but the return value is
    
       * uint16_t. Providing a vector larger than the max of a uint16_t
    
       * (65536) would miss anything outside of this boundary. The kernel
    
       * will check the length of num_points and cap it to this max value,
    
       * anyways.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li source: The complex input vector.
    
       * \li num_points: The number of samples.
    
       *
    
       * \b Outputs
    
       * \li target: The index of the point with minimum magnitude.
    
       *
    
       * \b Example
    
       * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
    
       * the unit circle.
    
       * \code
    
       *   int N = 10;
    
       *   uint32_t alignment = volk_get_alignment();
    
       *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
    
       *
    
       *   for(uint32_t ii = 0; ii < N/2; ++ii){
    
       *       float real = 2.f * ((float)ii / (float)N) - 1.f;
    
       *       float imag = std::sqrt(1.f - real * real);
    
       *       in[ii] = lv_cmake(real, imag);
    
       *       in[ii] = in[ii] * in[ii] + in[ii];
    
       *       in[N-ii] = lv_cmake(real, imag);
    
       *       in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
    
       *   }
    
       *
    
       *   volk_32fc_index_min_16u(min, in, N);
    
       *
    
       *   printf("index of min value = %u\n",  *min);
    
       *
    
       *   volk_free(in);
    
       *   volk_free(min);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
    
      #define INCLUDED_volk_32fc_index_min_16u_a_H
    
      #include <inttypes.h>
    
      #include <limits.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_load_ps((float*)source);
    
      16382
              __m256 in1 = _mm256_load_ps((float*)(source + 4));
    
      16382
              vector_32fc_index_min_variant0(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      16382
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (min_values_buffer[i] < min) {
    
      4
                  min = min_values_buffer[i];
    
      4
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_load_ps((float*)source);
    
      16382
              __m256 in1 = _mm256_load_ps((float*)(source + 4));
    
      16382
              vector_32fc_index_min_variant1(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      16382
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (min_values_buffer[i] < min) {
    
      4
                  min = min_values_buffer[i];
    
      4
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
    
                                                        const lv_32fc_t* source,
    
                                                        uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
          union bit128 holderf;
    
          union bit128 holderi;
    
      2
          float sq_dist = 0.0;
    
          union bit128 xmm5, xmm4;
    
          __m128 xmm1, xmm2, xmm3;
    
          __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    
      2
          xmm5.int_vec = _mm_setzero_si128();
    
      2
          xmm4.int_vec = _mm_setzero_si128();
    
      2
          holderf.int_vec = _mm_setzero_si128();
    
      2
          holderi.int_vec = _mm_setzero_si128();
    
      2
          xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    
      2
          xmm9 = _mm_setzero_si128();
    
      2
          xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    
      2
          xmm3 = _mm_set_ps1(FLT_MAX);
    
      2
          int bound = num_points >> 2;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (int i = 0; i < bound; ++i) {
    
      32766
              xmm1 = _mm_load_ps((float*)source);
    
      32766
              xmm2 = _mm_load_ps((float*)&source[2]);
    
      32766
              source += 4;
    
      65532
              xmm1 = _mm_mul_ps(xmm1, xmm1);
    
      32766
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      32766
              xmm1 = _mm_hadd_ps(xmm1, xmm2);
    
      32766
              xmm3 = _mm_min_ps(xmm1, xmm3);
    
      32766
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      32766
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      32766
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      65532
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      32766
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      65532
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points >> 1 & 1) {
    
      2
              xmm2 = _mm_load_ps((float*)source);
    
      2
              xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    
      2
              xmm8 = bit128_p(&xmm1)->int_vec;
    
      2
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      2
              source += 2;
    
      2
              xmm1 = _mm_hadd_ps(xmm2, xmm2);
    
      4
              xmm3 = _mm_min_ps(xmm1, xmm3);
    
      2
              xmm10 = _mm_setr_epi32(2, 2, 2, 2);
    
      2
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      4
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points & 1) {
    
      2
              sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
    
      2
                        lv_cimag(source[0]) * lv_cimag(source[0]);
    
      2
              xmm2 = _mm_load1_ps(&sq_dist);
    
      2
              xmm1 = xmm3;
    
      2
              xmm3 = _mm_min_ss(xmm3, xmm2);
    
      2
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
          }
    
          _mm_store_ps((float*)&(holderf.f), xmm3);
    
          _mm_store_si128(&(holderi.int_vec), xmm9);
    
      2
          target[0] = holderi.i[0];
    
      2
          sq_dist = holderf.f[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
    
      2
      }
    
      #endif /*LV_HAVE_SSE3*/
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
    
                                                         const lv_32fc_t* source,
    
                                                         uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          float sq_dist = 0.0;
    
      2
          float min = FLT_MAX;
    
      2
          uint16_t index = 0;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (uint32_t i = 0; i < num_points; ++i) {
    
      131070
              sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
    
      131070
                        lv_cimag(source[i]) * lv_cimag(source[i]);
    
        2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 131052 times.

      131070
              if (sq_dist < min) {
    
      18
                  index = i;
    
      18
                  min = sq_dist;
    
              }
    
          }
    
      2
          target[0] = index;
    
      2
      }
    
      #endif /*LV_HAVE_GENERIC*/
    
      #endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
    
      #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
    
      #define INCLUDED_volk_32fc_index_min_16u_u_H
    
      #include <inttypes.h>
    
      #include <limits.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_loadu_ps((float*)source);
    
      16382
              __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
    
      16382
              vector_32fc_index_min_variant0(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      16382
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (min_values_buffer[i] < min) {
    
      4
                  min = min_values_buffer[i];
    
      4
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      16382
              __m256 in0 = _mm256_loadu_ps((float*)source);
    
      16382
              __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
    
      16382
              vector_32fc_index_min_variant1(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      16382
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (min_values_buffer[i] < min) {
    
      4
                  min = min_values_buffer[i];
    
      4
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/

Line	Branch	Exec	Source
1			/* -- c++ -- */
2			/*
3			* Copyright 2021 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_index_min_16u
12			*
13			* \b Overview
14			*
15			* Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
16			* minimum magnitude for complex points in the given vector.
17			*
18			* Note that num_points is a uint32_t, but the return value is
19			* uint16_t. Providing a vector larger than the max of a uint16_t
20			* (65536) would miss anything outside of this boundary. The kernel
21			* will check the length of num_points and cap it to this max value,
22			* anyways.
23			*
24			* <b>Dispatcher Prototype</b>
25			* \code
26			* void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t
27			* num_points) \endcode
28			*
29			* \b Inputs
30			* \li source: The complex input vector.
31			* \li num_points: The number of samples.
32			*
33			* \b Outputs
34			* \li target: The index of the point with minimum magnitude.
35			*
36			* \b Example
37			* Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
38			* the unit circle.
39			* \code
40			* int N = 10;
41			* uint32_t alignment = volk_get_alignment();
42			* lv_32fc_t* in = (lv_32fc_t)volk_malloc(sizeof(lv_32fc_t)N, alignment);
43			* uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
44			*
45			* for(uint32_t ii = 0; ii < N/2; ++ii){
46			* float real = 2.f * ((float)ii / (float)N) - 1.f;
47			* float imag = std::sqrt(1.f - real * real);
48			* in[ii] = lv_cmake(real, imag);
49			* in[ii] = in[ii] * in[ii] + in[ii];
50			* in[N-ii] = lv_cmake(real, imag);
51			* in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
52			* }
53			*
54			* volk_32fc_index_min_16u(min, in, N);
55			*
56			* printf("index of min value = %u\n", *min);
57			*
58			* volk_free(in);
59			* volk_free(min);
60			* \endcode
61			*/
62
63			#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
64			#define INCLUDED_volk_32fc_index_min_16u_a_H
65
66			#include <inttypes.h>
67			#include <limits.h>
68			#include <stdio.h>
69			#include <volk/volk_common.h>
70			#include <volk/volk_complex.h>
71
72			#ifdef LV_HAVE_AVX2
73			#include <immintrin.h>
74			#include <volk/volk_avx2_intrinsics.h>
75
76		2	static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
77			const lv_32fc_t* source,
78			uint32_t num_points)
79			{
80		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81
82		2	const __m256i indices_increment = _mm256_set1_epi32(8);
83			/*
84			* At the start of each loop iteration current_indices holds the indices of
85			* the complex numbers loaded from memory. Explanation for odd order is given
86			* in implementation of vector_32fc_index_min_variant0().
87			*/
88		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89
90		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
91		2	__m256i min_indices = _mm256_setzero_si256();
92
93	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
94		16382	__m256 in0 = _mm256_load_ps((float*)source);
95		16382	__m256 in1 = _mm256_load_ps((float*)(source + 4));
96		16382	vector_32fc_index_min_variant0(
97			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
98		16382	source += 8;
99			}
100
101			// determine minimum value and index in the result of the vectorized loop
102			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
103			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
104		2	_mm256_store_ps(min_values_buffer, min_values);
105		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
106
107		2	float min = FLT_MAX;
108		2	uint32_t index = 0;
109	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
110	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (min_values_buffer[i] < min) {
111		4	min = min_values_buffer[i];
112		4	index = min_indices_buffer[i];
113			}
114			}
115
116			// handle tail not processed by the vectorized loop
117	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118		14	const float abs_squared =
119		14	lv_creal(source) lv_creal(source) + lv_cimag(source) * lv_cimag(*source);
120	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
121		✗	min = abs_squared;
122		✗	index = i;
123			}
124		14	++source;
125			}
126
127		2	*target = index;
128		2	}
129
130			#endif /LV_HAVE_AVX2/
131
132			#ifdef LV_HAVE_AVX2
133			#include <immintrin.h>
134			#include <volk/volk_avx2_intrinsics.h>
135
136		2	static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
137			const lv_32fc_t* source,
138			uint32_t num_points)
139			{
140		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141
142		2	const __m256i indices_increment = _mm256_set1_epi32(8);
143			/*
144			* At the start of each loop iteration current_indices holds the indices of
145			* the complex numbers loaded from memory. Explanation for odd order is given
146			* in implementation of vector_32fc_index_min_variant0().
147			*/
148		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149
150		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
151		2	__m256i min_indices = _mm256_setzero_si256();
152
153	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
154		16382	__m256 in0 = _mm256_load_ps((float*)source);
155		16382	__m256 in1 = _mm256_load_ps((float*)(source + 4));
156		16382	vector_32fc_index_min_variant1(
157			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
158		16382	source += 8;
159			}
160
161			// determine minimum value and index in the result of the vectorized loop
162			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
163			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
164		2	_mm256_store_ps(min_values_buffer, min_values);
165		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
166
167		2	float min = FLT_MAX;
168		2	uint32_t index = 0;
169	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
170	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (min_values_buffer[i] < min) {
171		4	min = min_values_buffer[i];
172		4	index = min_indices_buffer[i];
173			}
174			}
175
176			// handle tail not processed by the vectorized loop
177	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178		14	const float abs_squared =
179		14	lv_creal(source) lv_creal(source) + lv_cimag(source) * lv_cimag(*source);
180	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
181		✗	min = abs_squared;
182		✗	index = i;
183			}
184		14	++source;
185			}
186
187		2	*target = index;
188		2	}
189
190			#endif /LV_HAVE_AVX2/
191
192			#ifdef LV_HAVE_SSE3
193			#include <pmmintrin.h>
194			#include <xmmintrin.h>
195
196		2	static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
197			const lv_32fc_t* source,
198			uint32_t num_points)
199			{
200		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
201
202			union bit128 holderf;
203			union bit128 holderi;
204		2	float sq_dist = 0.0;
205
206			union bit128 xmm5, xmm4;
207			__m128 xmm1, xmm2, xmm3;
208			__m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209
210		2	xmm5.int_vec = _mm_setzero_si128();
211		2	xmm4.int_vec = _mm_setzero_si128();
212		2	holderf.int_vec = _mm_setzero_si128();
213		2	holderi.int_vec = _mm_setzero_si128();
214
215		2	xmm8 = _mm_setr_epi32(0, 1, 2, 3);
216		2	xmm9 = _mm_setzero_si128();
217		2	xmm10 = _mm_setr_epi32(4, 4, 4, 4);
218		2	xmm3 = _mm_set_ps1(FLT_MAX);
219
220		2	int bound = num_points >> 2;
221
222	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (int i = 0; i < bound; ++i) {
223		32766	xmm1 = _mm_load_ps((float*)source);
224		32766	xmm2 = _mm_load_ps((float*)&source[2]);
225
226		32766	source += 4;
227
228		65532	xmm1 = _mm_mul_ps(xmm1, xmm1);
229		32766	xmm2 = _mm_mul_ps(xmm2, xmm2);
230
231		32766	xmm1 = _mm_hadd_ps(xmm1, xmm2);
232
233		32766	xmm3 = _mm_min_ps(xmm1, xmm3);
234
235		32766	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
236		32766	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
237
238		32766	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
239		65532	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
240
241		32766	xmm9 = _mm_add_epi32(xmm11, xmm12);
242
243		65532	xmm8 = _mm_add_epi32(xmm8, xmm10);
244			}
245
246	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_points >> 1 & 1) {
247		2	xmm2 = _mm_load_ps((float*)source);
248
249		2	xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
250		2	xmm8 = bit128_p(&xmm1)->int_vec;
251
252		2	xmm2 = _mm_mul_ps(xmm2, xmm2);
253
254		2	source += 2;
255
256		2	xmm1 = _mm_hadd_ps(xmm2, xmm2);
257
258		4	xmm3 = _mm_min_ps(xmm1, xmm3);
259
260		2	xmm10 = _mm_setr_epi32(2, 2, 2, 2);
261
262		2	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
263		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
264
265		2	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
266		4	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
267
268		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
269
270		4	xmm8 = _mm_add_epi32(xmm8, xmm10);
271			}
272
273	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_points & 1) {
274		2	sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
275		2	lv_cimag(source[0]) * lv_cimag(source[0]);
276
277		2	xmm2 = _mm_load1_ps(&sq_dist);
278
279		2	xmm1 = xmm3;
280
281		2	xmm3 = _mm_min_ss(xmm3, xmm2);
282
283		2	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
284		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
285
286		2	xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
287
288		2	xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
289		4	xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
290
291		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
292			}
293
294			_mm_store_ps((float*)&(holderf.f), xmm3);
295			_mm_store_si128(&(holderi.int_vec), xmm9);
296
297		2	target[0] = holderi.i[0];
298		2	sq_dist = holderf.f[0];
299	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
300	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
301	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
302	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
303	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
304	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
305		2	}
306
307			#endif /LV_HAVE_SSE3/
308
309			#ifdef LV_HAVE_GENERIC
310		2	static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
311			const lv_32fc_t* source,
312			uint32_t num_points)
313			{
314		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316		2	float sq_dist = 0.0;
317		2	float min = FLT_MAX;
318		2	uint16_t index = 0;
319
320	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (uint32_t i = 0; i < num_points; ++i) {
321		131070	sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
322		131070	lv_cimag(source[i]) * lv_cimag(source[i]);
323
324	2/2 ✓ Branch 0 taken 18 times. ✓ Branch 1 taken 131052 times.	131070	if (sq_dist < min) {
325		18	index = i;
326		18	min = sq_dist;
327			}
328			}
329		2	target[0] = index;
330		2	}
331
332			#endif /LV_HAVE_GENERIC/
333
334			#endif /INCLUDED_volk_32fc_index_min_16u_a_H/
335
336			#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
337			#define INCLUDED_volk_32fc_index_min_16u_u_H
338
339			#include <inttypes.h>
340			#include <limits.h>
341			#include <stdio.h>
342			#include <volk/volk_common.h>
343			#include <volk/volk_complex.h>
344
345			#ifdef LV_HAVE_AVX2
346			#include <immintrin.h>
347			#include <volk/volk_avx2_intrinsics.h>
348
349		2	static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
350			const lv_32fc_t* source,
351			uint32_t num_points)
352			{
353		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
354
355		2	const __m256i indices_increment = _mm256_set1_epi32(8);
356			/*
357			* At the start of each loop iteration current_indices holds the indices of
358			* the complex numbers loaded from memory. Explanation for odd order is given
359			* in implementation of vector_32fc_index_min_variant0().
360			*/
361		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
362
363		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
364		2	__m256i min_indices = _mm256_setzero_si256();
365
366	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
367		16382	__m256 in0 = _mm256_loadu_ps((float*)source);
368		16382	__m256 in1 = _mm256_loadu_ps((float*)(source + 4));
369		16382	vector_32fc_index_min_variant0(
370			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
371		16382	source += 8;
372			}
373
374			// determine minimum value and index in the result of the vectorized loop
375			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
376			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
377		2	_mm256_store_ps(min_values_buffer, min_values);
378		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
379
380		2	float min = FLT_MAX;
381		2	uint32_t index = 0;
382	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
383	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (min_values_buffer[i] < min) {
384		4	min = min_values_buffer[i];
385		4	index = min_indices_buffer[i];
386			}
387			}
388
389			// handle tail not processed by the vectorized loop
390	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
391		14	const float abs_squared =
392		14	lv_creal(source) lv_creal(source) + lv_cimag(source) * lv_cimag(*source);
393	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
394		✗	min = abs_squared;
395		✗	index = i;
396			}
397		14	++source;
398			}
399
400		2	*target = index;
401		2	}
402
403			#endif /LV_HAVE_AVX2/
404
405			#ifdef LV_HAVE_AVX2
406			#include <immintrin.h>
407			#include <volk/volk_avx2_intrinsics.h>
408
409		2	static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
410			const lv_32fc_t* source,
411			uint32_t num_points)
412			{
413		2	num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
414
415		2	const __m256i indices_increment = _mm256_set1_epi32(8);
416			/*
417			* At the start of each loop iteration current_indices holds the indices of
418			* the complex numbers loaded from memory. Explanation for odd order is given
419			* in implementation of vector_32fc_index_min_variant0().
420			*/
421		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
422
423		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
424		2	__m256i min_indices = _mm256_setzero_si256();
425
426	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned i = 0; i < num_points / 8u; ++i) {
427		16382	__m256 in0 = _mm256_loadu_ps((float*)source);
428		16382	__m256 in1 = _mm256_loadu_ps((float*)(source + 4));
429		16382	vector_32fc_index_min_variant1(
430			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
431		16382	source += 8;
432			}
433
434			// determine minimum value and index in the result of the vectorized loop
435			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
436			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
437		2	_mm256_store_ps(min_values_buffer, min_values);
438		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
439
440		2	float min = FLT_MAX;
441		2	uint32_t index = 0;
442	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
443	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (min_values_buffer[i] < min) {
444		4	min = min_values_buffer[i];
445		4	index = min_indices_buffer[i];
446			}
447			}
448
449			// handle tail not processed by the vectorized loop
450	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
451		14	const float abs_squared =
452		14	lv_creal(source) lv_creal(source) + lv_cimag(source) * lv_cimag(*source);
453	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
454		✗	min = abs_squared;
455		✗	index = i;
456			}
457		14	++source;
458			}
459
460		2	*target = index;
461		2	}
462
463			#endif /LV_HAVE_AVX2/
464
465			#endif /INCLUDED_volk_32fc_index_min_16u_u_H/
466