GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_index_max_32u.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	177	185	95.7%
Functions:	6	6	100.0%
Branches:	54	62	87.1%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_index_max_32u
    
       *
    
       * \b Overview
    
       *
    
       * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
    
       * maximum magnitude for complex points in the given vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li src0: The complex input vector.
    
       * \li num_points: The number of samples.
    
       *
    
       * \b Outputs
    
       * \li target: The index of the point with maximum magnitude.
    
       *
    
       * \b Example
    
       * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
    
       * the unit circle.
    
       * \code
    
       *   int N = 10;
    
       *   uint32_t alignment = volk_get_alignment();
    
       *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
    
       *
    
       *   for(uint32_t ii = 0; ii < N/2; ++ii){
    
       *       float real = 2.f * ((float)ii / (float)N) - 1.f;
    
       *       float imag = std::sqrt(1.f - real * real);
    
       *       in[ii] = lv_cmake(real, imag);
    
       *       in[ii] = in[ii] * in[ii] + in[ii];
    
       *       in[N-ii] = lv_cmake(real, imag);
    
       *       in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
    
       *   }
    
       *
    
       *   volk_32fc_index_max_32u(max, in, N);
    
       *
    
       *   printf("index of max value = %u\n",  *max);
    
       *
    
       *   volk_free(in);
    
       *   volk_free(max);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
    
      #define INCLUDED_volk_32fc_index_max_32u_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_load_ps((float*)src0);
    
      32766
              __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
    
      32766
              vector_32fc_index_max_variant0(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      32766
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (max_values_buffer[i] > max) {
    
      4
                  max = max_values_buffer[i];
    
      4
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_load_ps((float*)src0);
    
      32766
              __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
    
      32766
              vector_32fc_index_max_variant1(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      32766
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (max_values_buffer[i] > max) {
    
      4
                  max = max_values_buffer[i];
    
      4
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
    
      {
    
      2
          const uint32_t num_bytes = num_points * 8;
    
          union bit128 holderf;
    
          union bit128 holderi;
    
      2
          float sq_dist = 0.0;
    
          union bit128 xmm5, xmm4;
    
          __m128 xmm1, xmm2, xmm3;
    
          __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    
      2
          xmm5.int_vec = _mm_setzero_si128();
    
      2
          xmm4.int_vec = _mm_setzero_si128();
    
      2
          holderf.int_vec = _mm_setzero_si128();
    
      2
          holderi.int_vec = _mm_setzero_si128();
    
      2
          int bound = num_bytes >> 5;
    
      2
          int i = 0;
    
      2
          xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    
      2
          xmm9 = _mm_setzero_si128();
    
      2
          xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    
      2
          xmm3 = _mm_setzero_ps();
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; i < bound; ++i) {
    
      65534
              xmm1 = _mm_load_ps((float*)src0);
    
      65534
              xmm2 = _mm_load_ps((float*)&src0[2]);
    
      65534
              src0 += 4;
    
      131068
              xmm1 = _mm_mul_ps(xmm1, xmm1);
    
      65534
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      65534
              xmm1 = _mm_hadd_ps(xmm1, xmm2);
    
      65534
              xmm3 = _mm_max_ps(xmm1, xmm3);
    
      65534
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      65534
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      65534
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      131068
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      65534
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      131068
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_bytes >> 4 & 1) {
    
      2
              xmm2 = _mm_load_ps((float*)src0);
    
      2
              xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    
      2
              xmm8 = bit128_p(&xmm1)->int_vec;
    
      2
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      2
              src0 += 2;
    
      2
              xmm1 = _mm_hadd_ps(xmm2, xmm2);
    
      4
              xmm3 = _mm_max_ps(xmm1, xmm3);
    
      2
              xmm10 = _mm_setr_epi32(2, 2, 2, 2);
    
      2
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      4
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_bytes >> 3 & 1) {
    
      2
              sq_dist =
    
      2
                  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
    
      2
              xmm2 = _mm_load1_ps(&sq_dist);
    
      2
              xmm1 = xmm3;
    
      2
              xmm3 = _mm_max_ss(xmm3, xmm2);
    
      2
              xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
          }
    
          _mm_store_ps((float*)&(holderf.f), xmm3);
    
          _mm_store_si128(&(holderi.int_vec), xmm9);
    
      2
          target[0] = holderi.i[0];
    
      2
          sq_dist = holderf.f[0];
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.

      2
          sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
    
      2
      }
    
      #endif /*LV_HAVE_SSE3*/
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
    
      {
    
      2
          const uint32_t num_bytes = num_points * 8;
    
      2
          float sq_dist = 0.0;
    
      2
          float max = 0.0;
    
      2
          uint32_t index = 0;
    
      2
          uint32_t i = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (; i<num_bytes>> 3; ++i) {
    
      262142
              sq_dist =
    
      262142
                  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
    
        2/2✓ Branch 0 taken 23 times.
✓ Branch 1 taken 262119 times.

      262142
              if (sq_dist > max) {
    
      23
                  index = i;
    
      23
                  max = sq_dist;
    
              }
    
          }
    
      2
          target[0] = index;
    
      2
      }
    
      #endif /*LV_HAVE_GENERIC*/
    
      #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
    
      #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
    
      #define INCLUDED_volk_32fc_index_max_32u_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_loadu_ps((float*)src0);
    
      32766
              __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
    
      32766
              vector_32fc_index_max_variant0(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      32766
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (max_values_buffer[i] > max) {
    
      4
                  max = max_values_buffer[i];
    
      4
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
    
                                                                  lv_32fc_t* src0,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_max_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 max_values = _mm256_setzero_ps();
    
      2
          __m256i max_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_loadu_ps((float*)src0);
    
      32766
              __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
    
      32766
              vector_32fc_index_max_variant1(
    
                  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    
      32766
              src0 += 8;
    
          }
    
          // determine maximum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    
      2
          _mm256_store_ps(max_values_buffer, max_values);
    
      2
          _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
    
      2
          float max = 0.f;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12 times.

      16
              if (max_values_buffer[i] > max) {
    
      4
                  max = max_values_buffer[i];
    
      4
                  index = max_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared > max) {
    
      ✗
                  max = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++src0;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #include <volk/volk_neon_intrinsics.h>
    
      static inline void
    
      volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
    
      {
    
          unsigned int number = 0;
    
          const uint32_t quarter_points = num_points / 4;
    
          const lv_32fc_t* src0Ptr = src0;
    
          uint32_t indices[4] = { 0, 1, 2, 3 };
    
          const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    
          uint32x4_t vec_indices = vld1q_u32(indices);
    
          uint32x4_t vec_max_indices = vec_indices;
    
          if (num_points) {
    
              float max = FLT_MIN;
    
              uint32_t index = 0;
    
              float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
    
              for (; number < quarter_points; number++) {
    
                  // Load complex and compute magnitude squared
    
                  const float32x4_t vec_mag2 =
    
                      _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
    
                  __VOLK_PREFETCH(src0Ptr += 4);
    
                  // a > b?
    
                  const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
    
                  vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
    
                  vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
    
                  vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    
              }
    
              uint32_t tmp_max_indices[4];
    
              float tmp_max[4];
    
              vst1q_u32(tmp_max_indices, vec_max_indices);
    
              vst1q_f32(tmp_max, vec_max);
    
              for (int i = 0; i < 4; i++) {
    
                  if (tmp_max[i] > max) {
    
                      max = tmp_max[i];
    
                      index = tmp_max_indices[i];
    
                  }
    
              }
    
              // Deal with the rest
    
              for (number = quarter_points * 4; number < num_points; number++) {
    
                  const float re = lv_creal(*src0Ptr);
    
                  const float im = lv_cimag(*src0Ptr);
    
                  const float sq_dist = re * re + im * im;
    
                  if (sq_dist > max) {
    
                      max = sq_dist;
    
                      index = number;
    
                  }
    
                  src0Ptr++;
    
              }
    
              *target = index;
    
          }
    
      }
    
      #endif /*LV_HAVE_NEON*/
    
      #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/

Line	Branch	Exec	Source
1			/* -- c++ -- */
2			/*
3			* Copyright 2016, 2018-2020 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_index_max_32u
12			*
13			* \b Overview
14			*
15			* Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
16			* maximum magnitude for complex points in the given vector.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
21			* num_points) \endcode
22			*
23			* \b Inputs
24			* \li src0: The complex input vector.
25			* \li num_points: The number of samples.
26			*
27			* \b Outputs
28			* \li target: The index of the point with maximum magnitude.
29			*
30			* \b Example
31			* Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
32			* the unit circle.
33			* \code
34			* int N = 10;
35			* uint32_t alignment = volk_get_alignment();
36			* lv_32fc_t* in = (lv_32fc_t)volk_malloc(sizeof(lv_32fc_t)N, alignment);
37			* uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
38			*
39			* for(uint32_t ii = 0; ii < N/2; ++ii){
40			* float real = 2.f * ((float)ii / (float)N) - 1.f;
41			* float imag = std::sqrt(1.f - real * real);
42			* in[ii] = lv_cmake(real, imag);
43			* in[ii] = in[ii] * in[ii] + in[ii];
44			* in[N-ii] = lv_cmake(real, imag);
45			* in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
46			* }
47			*
48			* volk_32fc_index_max_32u(max, in, N);
49			*
50			* printf("index of max value = %u\n", *max);
51			*
52			* volk_free(in);
53			* volk_free(max);
54			* \endcode
55			*/
56
57			#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
58			#define INCLUDED_volk_32fc_index_max_32u_a_H
59
60			#include <inttypes.h>
61			#include <stdio.h>
62			#include <volk/volk_common.h>
63			#include <volk/volk_complex.h>
64
65			#ifdef LV_HAVE_AVX2
66			#include <immintrin.h>
67			#include <volk/volk_avx2_intrinsics.h>
68
69		2	static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
70			lv_32fc_t* src0,
71			uint32_t num_points)
72			{
73		2	const __m256i indices_increment = _mm256_set1_epi32(8);
74			/*
75			* At the start of each loop iteration current_indices holds the indices of
76			* the complex numbers loaded from memory. Explanation for odd order is given
77			* in implementation of vector_32fc_index_max_variant0().
78			*/
79		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
80
81		2	__m256 max_values = _mm256_setzero_ps();
82		2	__m256i max_indices = _mm256_setzero_si256();
83
84	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
85		32766	__m256 in0 = _mm256_load_ps((float*)src0);
86		32766	__m256 in1 = _mm256_load_ps((float*)(src0 + 4));
87		32766	vector_32fc_index_max_variant0(
88			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
89		32766	src0 += 8;
90			}
91
92			// determine maximum value and index in the result of the vectorized loop
93			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
94			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
95		2	_mm256_store_ps(max_values_buffer, max_values);
96		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
97
98		2	float max = 0.f;
99		2	uint32_t index = 0;
100	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
101	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (max_values_buffer[i] > max) {
102		4	max = max_values_buffer[i];
103		4	index = max_indices_buffer[i];
104			}
105			}
106
107			// handle tail not processed by the vectorized loop
108	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
109		14	const float abs_squared =
110		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
111	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
112		✗	max = abs_squared;
113		✗	index = i;
114			}
115		14	++src0;
116			}
117
118		2	*target = index;
119		2	}
120
121			#endif /LV_HAVE_AVX2/
122
123			#ifdef LV_HAVE_AVX2
124			#include <immintrin.h>
125			#include <volk/volk_avx2_intrinsics.h>
126
127		2	static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
128			lv_32fc_t* src0,
129			uint32_t num_points)
130			{
131		2	const __m256i indices_increment = _mm256_set1_epi32(8);
132			/*
133			* At the start of each loop iteration current_indices holds the indices of
134			* the complex numbers loaded from memory. Explanation for odd order is given
135			* in implementation of vector_32fc_index_max_variant0().
136			*/
137		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
138
139		2	__m256 max_values = _mm256_setzero_ps();
140		2	__m256i max_indices = _mm256_setzero_si256();
141
142	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
143		32766	__m256 in0 = _mm256_load_ps((float*)src0);
144		32766	__m256 in1 = _mm256_load_ps((float*)(src0 + 4));
145		32766	vector_32fc_index_max_variant1(
146			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
147		32766	src0 += 8;
148			}
149
150			// determine maximum value and index in the result of the vectorized loop
151			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
152			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
153		2	_mm256_store_ps(max_values_buffer, max_values);
154		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
155
156		2	float max = 0.f;
157		2	uint32_t index = 0;
158	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
159	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (max_values_buffer[i] > max) {
160		4	max = max_values_buffer[i];
161		4	index = max_indices_buffer[i];
162			}
163			}
164
165			// handle tail not processed by the vectorized loop
166	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
167		14	const float abs_squared =
168		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
169	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
170		✗	max = abs_squared;
171		✗	index = i;
172			}
173		14	++src0;
174			}
175
176		2	*target = index;
177		2	}
178
179			#endif /LV_HAVE_AVX2/
180
181			#ifdef LV_HAVE_SSE3
182			#include <pmmintrin.h>
183			#include <xmmintrin.h>
184
185			static inline void
186		2	volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
187			{
188		2	const uint32_t num_bytes = num_points * 8;
189
190			union bit128 holderf;
191			union bit128 holderi;
192		2	float sq_dist = 0.0;
193
194			union bit128 xmm5, xmm4;
195			__m128 xmm1, xmm2, xmm3;
196			__m128i xmm8, xmm11, xmm12, xmm9, xmm10;
197
198		2	xmm5.int_vec = _mm_setzero_si128();
199		2	xmm4.int_vec = _mm_setzero_si128();
200		2	holderf.int_vec = _mm_setzero_si128();
201		2	holderi.int_vec = _mm_setzero_si128();
202
203		2	int bound = num_bytes >> 5;
204		2	int i = 0;
205
206		2	xmm8 = _mm_setr_epi32(0, 1, 2, 3);
207		2	xmm9 = _mm_setzero_si128();
208		2	xmm10 = _mm_setr_epi32(4, 4, 4, 4);
209		2	xmm3 = _mm_setzero_ps();
210
211	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; i < bound; ++i) {
212		65534	xmm1 = _mm_load_ps((float*)src0);
213		65534	xmm2 = _mm_load_ps((float*)&src0[2]);
214
215		65534	src0 += 4;
216
217		131068	xmm1 = _mm_mul_ps(xmm1, xmm1);
218		65534	xmm2 = _mm_mul_ps(xmm2, xmm2);
219
220		65534	xmm1 = _mm_hadd_ps(xmm1, xmm2);
221
222		65534	xmm3 = _mm_max_ps(xmm1, xmm3);
223
224		65534	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
225		65534	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
226
227		65534	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
228		131068	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
229
230		65534	xmm9 = _mm_add_epi32(xmm11, xmm12);
231
232		131068	xmm8 = _mm_add_epi32(xmm8, xmm10);
233			}
234
235	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_bytes >> 4 & 1) {
236		2	xmm2 = _mm_load_ps((float*)src0);
237
238		2	xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
239		2	xmm8 = bit128_p(&xmm1)->int_vec;
240
241		2	xmm2 = _mm_mul_ps(xmm2, xmm2);
242
243		2	src0 += 2;
244
245		2	xmm1 = _mm_hadd_ps(xmm2, xmm2);
246
247		4	xmm3 = _mm_max_ps(xmm1, xmm3);
248
249		2	xmm10 = _mm_setr_epi32(2, 2, 2, 2);
250
251		2	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
252		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
253
254		2	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
255		4	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
256
257		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
258
259		4	xmm8 = _mm_add_epi32(xmm8, xmm10);
260			}
261
262	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_bytes >> 3 & 1) {
263		2	sq_dist =
264		2	lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
265
266		2	xmm2 = _mm_load1_ps(&sq_dist);
267
268		2	xmm1 = xmm3;
269
270		2	xmm3 = _mm_max_ss(xmm3, xmm2);
271
272		2	xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
273		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
274
275		2	xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
276
277		2	xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
278		4	xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
279
280		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
281			}
282
283			_mm_store_ps((float*)&(holderf.f), xmm3);
284			_mm_store_si128(&(holderi.int_vec), xmm9);
285
286		2	target[0] = holderi.i[0];
287		2	sq_dist = holderf.f[0];
288	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
289	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times.	2	sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
290	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
291	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
292	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
293	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
294		2	}
295
296			#endif /LV_HAVE_SSE3/
297
298			#ifdef LV_HAVE_GENERIC
299			static inline void
300		2	volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
301			{
302		2	const uint32_t num_bytes = num_points * 8;
303
304		2	float sq_dist = 0.0;
305		2	float max = 0.0;
306		2	uint32_t index = 0;
307
308		2	uint32_t i = 0;
309
310	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (; i<num_bytes>> 3; ++i) {
311		262142	sq_dist =
312		262142	lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
313
314	2/2 ✓ Branch 0 taken 23 times. ✓ Branch 1 taken 262119 times.	262142	if (sq_dist > max) {
315		23	index = i;
316		23	max = sq_dist;
317			}
318			}
319		2	target[0] = index;
320		2	}
321
322			#endif /LV_HAVE_GENERIC/
323
324			#endif /INCLUDED_volk_32fc_index_max_32u_a_H/
325
326			#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
327			#define INCLUDED_volk_32fc_index_max_32u_u_H
328
329			#include <inttypes.h>
330			#include <stdio.h>
331			#include <volk/volk_common.h>
332			#include <volk/volk_complex.h>
333
334			#ifdef LV_HAVE_AVX2
335			#include <immintrin.h>
336			#include <volk/volk_avx2_intrinsics.h>
337
338		2	static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
339			lv_32fc_t* src0,
340			uint32_t num_points)
341			{
342		2	const __m256i indices_increment = _mm256_set1_epi32(8);
343			/*
344			* At the start of each loop iteration current_indices holds the indices of
345			* the complex numbers loaded from memory. Explanation for odd order is given
346			* in implementation of vector_32fc_index_max_variant0().
347			*/
348		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
349
350		2	__m256 max_values = _mm256_setzero_ps();
351		2	__m256i max_indices = _mm256_setzero_si256();
352
353	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
354		32766	__m256 in0 = _mm256_loadu_ps((float*)src0);
355		32766	__m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
356		32766	vector_32fc_index_max_variant0(
357			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
358		32766	src0 += 8;
359			}
360
361			// determine maximum value and index in the result of the vectorized loop
362			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
363			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
364		2	_mm256_store_ps(max_values_buffer, max_values);
365		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
366
367		2	float max = 0.f;
368		2	uint32_t index = 0;
369	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
370	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (max_values_buffer[i] > max) {
371		4	max = max_values_buffer[i];
372		4	index = max_indices_buffer[i];
373			}
374			}
375
376			// handle tail not processed by the vectorized loop
377	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
378		14	const float abs_squared =
379		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
380	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
381		✗	max = abs_squared;
382		✗	index = i;
383			}
384		14	++src0;
385			}
386
387		2	*target = index;
388		2	}
389
390			#endif /LV_HAVE_AVX2/
391
392			#ifdef LV_HAVE_AVX2
393			#include <immintrin.h>
394			#include <volk/volk_avx2_intrinsics.h>
395
396		2	static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
397			lv_32fc_t* src0,
398			uint32_t num_points)
399			{
400		2	const __m256i indices_increment = _mm256_set1_epi32(8);
401			/*
402			* At the start of each loop iteration current_indices holds the indices of
403			* the complex numbers loaded from memory. Explanation for odd order is given
404			* in implementation of vector_32fc_index_max_variant0().
405			*/
406		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
407
408		2	__m256 max_values = _mm256_setzero_ps();
409		2	__m256i max_indices = _mm256_setzero_si256();
410
411	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
412		32766	__m256 in0 = _mm256_loadu_ps((float*)src0);
413		32766	__m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
414		32766	vector_32fc_index_max_variant1(
415			in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
416		32766	src0 += 8;
417			}
418
419			// determine maximum value and index in the result of the vectorized loop
420			__VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
421			__VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
422		2	_mm256_store_ps(max_values_buffer, max_values);
423		2	_mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
424
425		2	float max = 0.f;
426		2	uint32_t index = 0;
427	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
428	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 12 times.	16	if (max_values_buffer[i] > max) {
429		4	max = max_values_buffer[i];
430		4	index = max_indices_buffer[i];
431			}
432			}
433
434			// handle tail not processed by the vectorized loop
435	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
436		14	const float abs_squared =
437		14	lv_creal(src0) lv_creal(src0) + lv_cimag(src0) * lv_cimag(*src0);
438	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared > max) {
439		✗	max = abs_squared;
440		✗	index = i;
441			}
442		14	++src0;
443			}
444
445		2	*target = index;
446		2	}
447
448			#endif /LV_HAVE_AVX2/
449
450			#ifdef LV_HAVE_NEON
451			#include <arm_neon.h>
452			#include <volk/volk_neon_intrinsics.h>
453
454			static inline void
455			volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
456			{
457			unsigned int number = 0;
458			const uint32_t quarter_points = num_points / 4;
459			const lv_32fc_t* src0Ptr = src0;
460
461			uint32_t indices[4] = { 0, 1, 2, 3 };
462			const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
463			uint32x4_t vec_indices = vld1q_u32(indices);
464			uint32x4_t vec_max_indices = vec_indices;
465
466			if (num_points) {
467			float max = FLT_MIN;
468			uint32_t index = 0;
469
470			float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
471
472			for (; number < quarter_points; number++) {
473			// Load complex and compute magnitude squared
474			const float32x4_t vec_mag2 =
475			_vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
476			__VOLK_PREFETCH(src0Ptr += 4);
477			// a > b?
478			const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
479			vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
480			vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
481			vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
482			}
483			uint32_t tmp_max_indices[4];
484			float tmp_max[4];
485			vst1q_u32(tmp_max_indices, vec_max_indices);
486			vst1q_f32(tmp_max, vec_max);
487
488			for (int i = 0; i < 4; i++) {
489			if (tmp_max[i] > max) {
490			max = tmp_max[i];
491			index = tmp_max_indices[i];
492			}
493			}
494
495			// Deal with the rest
496			for (number = quarter_points * 4; number < num_points; number++) {
497			const float re = lv_creal(*src0Ptr);
498			const float im = lv_cimag(*src0Ptr);
499			const float sq_dist = re * re + im * im;
500			if (sq_dist > max) {
501			max = sq_dist;
502			index = number;
503			}
504			src0Ptr++;
505			}
506			*target = index;
507			}
508			}
509
510			#endif /LV_HAVE_NEON/
511
512			#endif /INCLUDED_volk_32fc_index_max_32u_u_H/
513