GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_index_min_32u.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	173	181	95.6%
Functions:	6	6	100.0%
Branches:	56	62	90.3%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2021 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_index_min_32u
    
       *
    
       * \b Overview
    
       *
    
       * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
    
       * minimum magnitude for complex points in the given vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li source: The complex input vector.
    
       * \li num_points: The number of samples.
    
       *
    
       * \b Outputs
    
       * \li target: The index of the point with minimum magnitude.
    
       *
    
       * \b Example
    
       * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
    
       * the unit circle.
    
       * \code
    
       *   int N = 10;
    
       *   uint32_t alignment = volk_get_alignment();
    
       *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
    
       *
    
       *   for(uint32_t ii = 0; ii < N/2; ++ii){
    
       *       float real = 2.f * ((float)ii / (float)N) - 1.f;
    
       *       float imag = std::sqrt(1.f - real * real);
    
       *       in[ii] = lv_cmake(real, imag);
    
       *       in[ii] = in[ii] * in[ii] + in[ii];
    
       *       in[N-ii] = lv_cmake(real, imag);
    
       *       in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
    
       *   }
    
       *
    
       *   volk_32fc_index_min_32u(min, in, N);
    
       *
    
       *   printf("index of min value = %u\n",  *min);
    
       *
    
       *   volk_free(in);
    
       *   volk_free(min);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_index_min_32u_a_H
    
      #define INCLUDED_volk_32fc_index_min_32u_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_load_ps((float*)source);
    
      32766
              __m256 in1 = _mm256_load_ps((float*)(source + 4));
    
      32766
              vector_32fc_index_min_variant0(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      32766
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.

      16
              if (min_values_buffer[i] < min) {
    
      7
                  min = min_values_buffer[i];
    
      7
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_load_ps((float*)source);
    
      32766
              __m256 in1 = _mm256_load_ps((float*)(source + 4));
    
      32766
              vector_32fc_index_min_variant1(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      32766
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.

      16
              if (min_values_buffer[i] < min) {
    
      7
                  min = min_values_buffer[i];
    
      7
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
    
                                                        const lv_32fc_t* source,
    
                                                        uint32_t num_points)
    
      {
    
          union bit128 holderf;
    
          union bit128 holderi;
    
      2
          float sq_dist = 0.0;
    
          union bit128 xmm5, xmm4;
    
          __m128 xmm1, xmm2, xmm3;
    
          __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    
      2
          xmm5.int_vec = _mm_setzero_si128();
    
      2
          xmm4.int_vec = _mm_setzero_si128();
    
      2
          holderf.int_vec = _mm_setzero_si128();
    
      2
          holderi.int_vec = _mm_setzero_si128();
    
      2
          xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    
      2
          xmm9 = _mm_setzero_si128();
    
      2
          xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    
      2
          xmm3 = _mm_set_ps1(FLT_MAX);
    
      2
          int bound = num_points >> 2;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (int i = 0; i < bound; ++i) {
    
      65534
              xmm1 = _mm_load_ps((float*)source);
    
      65534
              xmm2 = _mm_load_ps((float*)&source[2]);
    
      65534
              source += 4;
    
      131068
              xmm1 = _mm_mul_ps(xmm1, xmm1);
    
      65534
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      65534
              xmm1 = _mm_hadd_ps(xmm1, xmm2);
    
      65534
              xmm3 = _mm_min_ps(xmm1, xmm3);
    
      65534
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      65534
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      65534
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      131068
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      65534
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      131068
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points >> 1 & 1) {
    
      2
              xmm2 = _mm_load_ps((float*)source);
    
      2
              xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    
      2
              xmm8 = bit128_p(&xmm1)->int_vec;
    
      2
              xmm2 = _mm_mul_ps(xmm2, xmm2);
    
      2
              source += 2;
    
      2
              xmm1 = _mm_hadd_ps(xmm2, xmm2);
    
      4
              xmm3 = _mm_min_ps(xmm1, xmm3);
    
      2
              xmm10 = _mm_setr_epi32(2, 2, 2, 2);
    
      2
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
      4
              xmm8 = _mm_add_epi32(xmm8, xmm10);
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points & 1) {
    
      2
              sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
    
      2
                        lv_cimag(source[0]) * lv_cimag(source[0]);
    
      2
              xmm2 = _mm_load1_ps(&sq_dist);
    
      2
              xmm1 = xmm3;
    
      2
              xmm3 = _mm_min_ss(xmm3, xmm2);
    
      2
              xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
    
      2
              xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
    
      2
              xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
    
      2
              xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    
      4
              xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
    
      2
              xmm9 = _mm_add_epi32(xmm11, xmm12);
    
          }
    
          _mm_store_ps((float*)&(holderf.f), xmm3);
    
          _mm_store_si128(&(holderi.int_vec), xmm9);
    
      2
          target[0] = holderi.i[0];
    
      2
          sq_dist = holderf.f[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.

      2
          sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
    
      2
      }
    
      #endif /*LV_HAVE_SSE3*/
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
    
                                                         const lv_32fc_t* source,
    
                                                         uint32_t num_points)
    
      {
    
      2
          float sq_dist = 0.0;
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (uint32_t i = 0; i < num_points; ++i) {
    
      262142
              sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
    
      262142
                        lv_cimag(source[i]) * lv_cimag(source[i]);
    
        2/2✓ Branch 0 taken 34 times.
✓ Branch 1 taken 262108 times.

      262142
              if (sq_dist < min) {
    
      34
                  index = i;
    
      34
                  min = sq_dist;
    
              }
    
          }
    
      2
          target[0] = index;
    
      2
      }
    
      #endif /*LV_HAVE_GENERIC*/
    
      #endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
    
      #ifndef INCLUDED_volk_32fc_index_min_32u_u_H
    
      #define INCLUDED_volk_32fc_index_min_32u_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_loadu_ps((float*)source);
    
      32766
              __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
    
      32766
              vector_32fc_index_min_variant0(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      32766
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.

      16
              if (min_values_buffer[i] < min) {
    
      7
                  min = min_values_buffer[i];
    
      7
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      #include <volk/volk_avx2_intrinsics.h>
    
      2
      static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
    
                                                                  const lv_32fc_t* source,
    
                                                                  uint32_t num_points)
    
      {
    
      2
          const __m256i indices_increment = _mm256_set1_epi32(8);
    
          /*
    
           * At the start of each loop iteration current_indices holds the indices of
    
           * the complex numbers loaded from memory. Explanation for odd order is given
    
           * in implementation of vector_32fc_index_min_variant0().
    
           */
    
      2
          __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
      2
          __m256 min_values = _mm256_set1_ps(FLT_MAX);
    
      2
          __m256i min_indices = _mm256_setzero_si256();
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned i = 0; i < num_points / 8u; ++i) {
    
      32766
              __m256 in0 = _mm256_loadu_ps((float*)source);
    
      32766
              __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
    
      32766
              vector_32fc_index_min_variant1(
    
                  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    
      32766
              source += 8;
    
          }
    
          // determine minimum value and index in the result of the vectorized loop
    
          __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    
          __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    
      2
          _mm256_store_ps(min_values_buffer, min_values);
    
      2
          _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    
      2
          float min = FLT_MAX;
    
      2
          uint32_t index = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.

      18
          for (unsigned i = 0; i < 8; i++) {
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.

      16
              if (min_values_buffer[i] < min) {
    
      7
                  min = min_values_buffer[i];
    
      7
                  index = min_indices_buffer[i];
    
              }
    
          }
    
          // handle tail not processed by the vectorized loop
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (unsigned i = num_points & (~7u); i < num_points; ++i) {
    
      14
              const float abs_squared =
    
      14
                  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (abs_squared < min) {
    
      ✗
                  min = abs_squared;
    
      ✗
                  index = i;
    
              }
    
      14
              ++source;
    
          }
    
      2
          *target = index;
    
      2
      }
    
      #endif /*LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #include <volk/volk_neon_intrinsics.h>
    
      static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
    
                                                      const lv_32fc_t* source,
    
                                                      uint32_t num_points)
    
      {
    
          const uint32_t quarter_points = num_points / 4;
    
          const lv_32fc_t* sourcePtr = source;
    
          uint32_t indices[4] = { 0, 1, 2, 3 };
    
          const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    
          uint32x4_t vec_indices = vld1q_u32(indices);
    
          uint32x4_t vec_min_indices = vec_indices;
    
          if (num_points) {
    
              float min = FLT_MAX;
    
              uint32_t index = 0;
    
              float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    
              for (uint32_t number = 0; number < quarter_points; number++) {
    
                  // Load complex and compute magnitude squared
    
                  const float32x4_t vec_mag2 =
    
                      _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
    
                  __VOLK_PREFETCH(sourcePtr += 4);
    
                  // a < b?
    
                  const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
    
                  vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
    
                  vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
    
                  vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    
              }
    
              uint32_t tmp_min_indices[4];
    
              float tmp_min[4];
    
              vst1q_u32(tmp_min_indices, vec_min_indices);
    
              vst1q_f32(tmp_min, vec_min);
    
              for (int i = 0; i < 4; i++) {
    
                  if (tmp_min[i] < min) {
    
                      min = tmp_min[i];
    
                      index = tmp_min_indices[i];
    
                  }
    
              }
    
              // Deal with the rest
    
              for (uint32_t number = quarter_points * 4; number < num_points; number++) {
    
                  const float re = lv_creal(*sourcePtr);
    
                  const float im = lv_cimag(*sourcePtr);
    
                  const float sq_dist = re * re + im * im;
    
                  if (sq_dist < min) {
    
                      min = sq_dist;
    
                      index = number;
    
                  }
    
                  sourcePtr++;
    
              }
    
              *target = index;
    
          }
    
      }
    
      #endif /*LV_HAVE_NEON*/
    
      #endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2021 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_index_min_32u
12			*
13			* \b Overview
14			*
15			* Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
16			* minimum magnitude for complex points in the given vector.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t
21			* num_points) \endcode
22			*
23			* \b Inputs
24			* \li source: The complex input vector.
25			* \li num_points: The number of samples.
26			*
27			* \b Outputs
28			* \li target: The index of the point with minimum magnitude.
29			*
30			* \b Example
31			* Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
32			* the unit circle.
33			* \code
34			* int N = 10;
35			* uint32_t alignment = volk_get_alignment();
36			* lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37			* uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
38			*
39			* for(uint32_t ii = 0; ii < N/2; ++ii){
40			* float real = 2.f * ((float)ii / (float)N) - 1.f;
41			* float imag = std::sqrt(1.f - real * real);
42			* in[ii] = lv_cmake(real, imag);
43			* in[ii] = in[ii] * in[ii] + in[ii];
44			* in[N-ii] = lv_cmake(real, imag);
45			* in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
46			* }
47			*
48			* volk_32fc_index_min_32u(min, in, N);
49			*
50			* printf("index of min value = %u\n", *min);
51			*
52			* volk_free(in);
53			* volk_free(min);
54			* \endcode
55			*/
56
57			#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
58			#define INCLUDED_volk_32fc_index_min_32u_a_H
59
60			#include <inttypes.h>
61			#include <stdio.h>
62			#include <volk/volk_common.h>
63			#include <volk/volk_complex.h>
64
65			#ifdef LV_HAVE_AVX2
66			#include <immintrin.h>
67			#include <volk/volk_avx2_intrinsics.h>
68
69		2	static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
70			const lv_32fc_t* source,
71			uint32_t num_points)
72			{
73		2	const __m256i indices_increment = _mm256_set1_epi32(8);
74			/*
75			* At the start of each loop iteration current_indices holds the indices of
76			* the complex numbers loaded from memory. Explanation for odd order is given
77			* in implementation of vector_32fc_index_min_variant0().
78			*/
79		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
80
81		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
82		2	__m256i min_indices = _mm256_setzero_si256();
83
84	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
85		32766	__m256 in0 = _mm256_load_ps((float*)source);
86		32766	__m256 in1 = _mm256_load_ps((float*)(source + 4));
87		32766	vector_32fc_index_min_variant0(
88			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
89		32766	source += 8;
90			}
91
92			// determine minimum value and index in the result of the vectorized loop
93			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
94			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
95		2	_mm256_store_ps(min_values_buffer, min_values);
96		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
97
98		2	float min = FLT_MAX;
99		2	uint32_t index = 0;
100	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
101	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 9 times.	16	if (min_values_buffer[i] < min) {
102		7	min = min_values_buffer[i];
103		7	index = min_indices_buffer[i];
104			}
105			}
106
107			// handle tail not processed by the vectorized loop
108	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
109		14	const float abs_squared =
110		14	lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
111	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
112		✗	min = abs_squared;
113		✗	index = i;
114			}
115		14	++source;
116			}
117
118		2	*target = index;
119		2	}
120
121			#endif /*LV_HAVE_AVX2*/
122
123			#ifdef LV_HAVE_AVX2
124			#include <immintrin.h>
125			#include <volk/volk_avx2_intrinsics.h>
126
127		2	static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
128			const lv_32fc_t* source,
129			uint32_t num_points)
130			{
131		2	const __m256i indices_increment = _mm256_set1_epi32(8);
132			/*
133			* At the start of each loop iteration current_indices holds the indices of
134			* the complex numbers loaded from memory. Explanation for odd order is given
135			* in implementation of vector_32fc_index_min_variant0().
136			*/
137		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
138
139		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
140		2	__m256i min_indices = _mm256_setzero_si256();
141
142	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
143		32766	__m256 in0 = _mm256_load_ps((float*)source);
144		32766	__m256 in1 = _mm256_load_ps((float*)(source + 4));
145		32766	vector_32fc_index_min_variant1(
146			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
147		32766	source += 8;
148			}
149
150			// determine minimum value and index in the result of the vectorized loop
151			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
152			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
153		2	_mm256_store_ps(min_values_buffer, min_values);
154		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
155
156		2	float min = FLT_MAX;
157		2	uint32_t index = 0;
158	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
159	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 9 times.	16	if (min_values_buffer[i] < min) {
160		7	min = min_values_buffer[i];
161		7	index = min_indices_buffer[i];
162			}
163			}
164
165			// handle tail not processed by the vectorized loop
166	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
167		14	const float abs_squared =
168		14	lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
169	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
170		✗	min = abs_squared;
171		✗	index = i;
172			}
173		14	++source;
174			}
175
176		2	*target = index;
177		2	}
178
179			#endif /*LV_HAVE_AVX2*/
180
181			#ifdef LV_HAVE_SSE3
182			#include <pmmintrin.h>
183			#include <xmmintrin.h>
184
185		2	static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
186			const lv_32fc_t* source,
187			uint32_t num_points)
188			{
189			union bit128 holderf;
190			union bit128 holderi;
191		2	float sq_dist = 0.0;
192
193			union bit128 xmm5, xmm4;
194			__m128 xmm1, xmm2, xmm3;
195			__m128i xmm8, xmm11, xmm12, xmm9, xmm10;
196
197		2	xmm5.int_vec = _mm_setzero_si128();
198		2	xmm4.int_vec = _mm_setzero_si128();
199		2	holderf.int_vec = _mm_setzero_si128();
200		2	holderi.int_vec = _mm_setzero_si128();
201
202		2	xmm8 = _mm_setr_epi32(0, 1, 2, 3);
203		2	xmm9 = _mm_setzero_si128();
204		2	xmm10 = _mm_setr_epi32(4, 4, 4, 4);
205		2	xmm3 = _mm_set_ps1(FLT_MAX);
206
207		2	int bound = num_points >> 2;
208
209	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (int i = 0; i < bound; ++i) {
210		65534	xmm1 = _mm_load_ps((float*)source);
211		65534	xmm2 = _mm_load_ps((float*)&source[2]);
212
213		65534	source += 4;
214
215		131068	xmm1 = _mm_mul_ps(xmm1, xmm1);
216		65534	xmm2 = _mm_mul_ps(xmm2, xmm2);
217
218		65534	xmm1 = _mm_hadd_ps(xmm1, xmm2);
219
220		65534	xmm3 = _mm_min_ps(xmm1, xmm3);
221
222		65534	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
223		65534	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
224
225		65534	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
226		131068	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
227
228		65534	xmm9 = _mm_add_epi32(xmm11, xmm12);
229
230		131068	xmm8 = _mm_add_epi32(xmm8, xmm10);
231			}
232
233	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_points >> 1 & 1) {
234		2	xmm2 = _mm_load_ps((float*)source);
235
236		2	xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
237		2	xmm8 = bit128_p(&xmm1)->int_vec;
238
239		2	xmm2 = _mm_mul_ps(xmm2, xmm2);
240
241		2	source += 2;
242
243		2	xmm1 = _mm_hadd_ps(xmm2, xmm2);
244
245		4	xmm3 = _mm_min_ps(xmm1, xmm3);
246
247		2	xmm10 = _mm_setr_epi32(2, 2, 2, 2);
248
249		2	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
250		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
251
252		2	xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
253		4	xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
254
255		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
256
257		4	xmm8 = _mm_add_epi32(xmm8, xmm10);
258			}
259
260	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if (num_points & 1) {
261		2	sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
262		2	lv_cimag(source[0]) * lv_cimag(source[0]);
263
264		2	xmm2 = _mm_load1_ps(&sq_dist);
265
266		2	xmm1 = xmm3;
267
268		2	xmm3 = _mm_min_ss(xmm3, xmm2);
269
270		2	xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
271		2	xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
272
273		2	xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
274
275		2	xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
276		4	xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
277
278		2	xmm9 = _mm_add_epi32(xmm11, xmm12);
279			}
280
281			_mm_store_ps((float*)&(holderf.f), xmm3);
282			_mm_store_si128(&(holderi.int_vec), xmm9);
283
284		2	target[0] = holderi.i[0];
285		2	sq_dist = holderf.f[0];
286	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
287	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
288	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
289	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
290	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
291	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 1 times.	2	sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
292		2	}
293
294			#endif /*LV_HAVE_SSE3*/
295
296			#ifdef LV_HAVE_GENERIC
297		2	static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
298			const lv_32fc_t* source,
299			uint32_t num_points)
300			{
301		2	float sq_dist = 0.0;
302		2	float min = FLT_MAX;
303		2	uint32_t index = 0;
304
305	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (uint32_t i = 0; i < num_points; ++i) {
306		262142	sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
307		262142	lv_cimag(source[i]) * lv_cimag(source[i]);
308
309	2/2 ✓ Branch 0 taken 34 times. ✓ Branch 1 taken 262108 times.	262142	if (sq_dist < min) {
310		34	index = i;
311		34	min = sq_dist;
312			}
313			}
314		2	target[0] = index;
315		2	}
316
317			#endif /*LV_HAVE_GENERIC*/
318
319			#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
320
321			#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
322			#define INCLUDED_volk_32fc_index_min_32u_u_H
323
324			#include <inttypes.h>
325			#include <stdio.h>
326			#include <volk/volk_common.h>
327			#include <volk/volk_complex.h>
328
329			#ifdef LV_HAVE_AVX2
330			#include <immintrin.h>
331			#include <volk/volk_avx2_intrinsics.h>
332
333		2	static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
334			const lv_32fc_t* source,
335			uint32_t num_points)
336			{
337		2	const __m256i indices_increment = _mm256_set1_epi32(8);
338			/*
339			* At the start of each loop iteration current_indices holds the indices of
340			* the complex numbers loaded from memory. Explanation for odd order is given
341			* in implementation of vector_32fc_index_min_variant0().
342			*/
343		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
344
345		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
346		2	__m256i min_indices = _mm256_setzero_si256();
347
348	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
349		32766	__m256 in0 = _mm256_loadu_ps((float*)source);
350		32766	__m256 in1 = _mm256_loadu_ps((float*)(source + 4));
351		32766	vector_32fc_index_min_variant0(
352			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
353		32766	source += 8;
354			}
355
356			// determine minimum value and index in the result of the vectorized loop
357			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
358			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
359		2	_mm256_store_ps(min_values_buffer, min_values);
360		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
361
362		2	float min = FLT_MAX;
363		2	uint32_t index = 0;
364	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
365	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 9 times.	16	if (min_values_buffer[i] < min) {
366		7	min = min_values_buffer[i];
367		7	index = min_indices_buffer[i];
368			}
369			}
370
371			// handle tail not processed by the vectorized loop
372	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
373		14	const float abs_squared =
374		14	lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
375	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
376		✗	min = abs_squared;
377		✗	index = i;
378			}
379		14	++source;
380			}
381
382		2	*target = index;
383		2	}
384
385			#endif /*LV_HAVE_AVX2*/
386
387			#ifdef LV_HAVE_AVX2
388			#include <immintrin.h>
389			#include <volk/volk_avx2_intrinsics.h>
390
391		2	static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
392			const lv_32fc_t* source,
393			uint32_t num_points)
394			{
395		2	const __m256i indices_increment = _mm256_set1_epi32(8);
396			/*
397			* At the start of each loop iteration current_indices holds the indices of
398			* the complex numbers loaded from memory. Explanation for odd order is given
399			* in implementation of vector_32fc_index_min_variant0().
400			*/
401		2	__m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
402
403		2	__m256 min_values = _mm256_set1_ps(FLT_MAX);
404		2	__m256i min_indices = _mm256_setzero_si256();
405
406	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned i = 0; i < num_points / 8u; ++i) {
407		32766	__m256 in0 = _mm256_loadu_ps((float*)source);
408		32766	__m256 in1 = _mm256_loadu_ps((float*)(source + 4));
409		32766	vector_32fc_index_min_variant1(
410			in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
411		32766	source += 8;
412			}
413
414			// determine minimum value and index in the result of the vectorized loop
415			__VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
416			__VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
417		2	_mm256_store_ps(min_values_buffer, min_values);
418		2	_mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
419
420		2	float min = FLT_MAX;
421		2	uint32_t index = 0;
422	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2 times.	18	for (unsigned i = 0; i < 8; i++) {
423	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 9 times.	16	if (min_values_buffer[i] < min) {
424		7	min = min_values_buffer[i];
425		7	index = min_indices_buffer[i];
426			}
427			}
428
429			// handle tail not processed by the vectorized loop
430	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (unsigned i = num_points & (~7u); i < num_points; ++i) {
431		14	const float abs_squared =
432		14	lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
433	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (abs_squared < min) {
434		✗	min = abs_squared;
435		✗	index = i;
436			}
437		14	++source;
438			}
439
440		2	*target = index;
441		2	}
442
443			#endif /*LV_HAVE_AVX2*/
444
445			#ifdef LV_HAVE_NEON
446			#include <arm_neon.h>
447			#include <volk/volk_neon_intrinsics.h>
448
449			static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
450			const lv_32fc_t* source,
451			uint32_t num_points)
452			{
453			const uint32_t quarter_points = num_points / 4;
454			const lv_32fc_t* sourcePtr = source;
455
456			uint32_t indices[4] = { 0, 1, 2, 3 };
457			const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
458			uint32x4_t vec_indices = vld1q_u32(indices);
459			uint32x4_t vec_min_indices = vec_indices;
460
461			if (num_points) {
462			float min = FLT_MAX;
463			uint32_t index = 0;
464
465			float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
466
467			for (uint32_t number = 0; number < quarter_points; number++) {
468			// Load complex and compute magnitude squared
469			const float32x4_t vec_mag2 =
470			_vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
471			__VOLK_PREFETCH(sourcePtr += 4);
472			// a < b?
473			const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
474			vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
475			vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
476			vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
477			}
478			uint32_t tmp_min_indices[4];
479			float tmp_min[4];
480			vst1q_u32(tmp_min_indices, vec_min_indices);
481			vst1q_f32(tmp_min, vec_min);
482
483			for (int i = 0; i < 4; i++) {
484			if (tmp_min[i] < min) {
485			min = tmp_min[i];
486			index = tmp_min_indices[i];
487			}
488			}
489
490			// Deal with the rest
491			for (uint32_t number = quarter_points * 4; number < num_points; number++) {
492			const float re = lv_creal(*sourcePtr);
493			const float im = lv_cimag(*sourcePtr);
494			const float sq_dist = re * re + im * im;
495			if (sq_dist < min) {
496			min = sq_dist;
497			index = number;
498			}
499			sourcePtr++;
500			}
501			*target = index;
502			}
503			}
504
505			#endif /*LV_HAVE_NEON*/
506
507			#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
508