GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_s32f_convert_16i.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	240	260	92.3%
Functions:	10	10	100.0%
Branches:	56	76	73.7%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_s32f_convert_16i
    
       *
    
       * \b Overview
    
       *
    
       * Converts a floating point number to a 16-bit short after applying a
    
       * scaling factor.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const
    
       * float scalar, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector: the input vector of floats.
    
       * \li scalar: The value multiplied against each point in the input buffer.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector.
    
       *
    
       * \b Example
    
       * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
    
       * delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing =
    
       * (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out =
    
       * (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
    
       *   }
    
       *
    
       *   // Normalize by the smallest delta (0.2 in this example)
    
       *   float scale = 5.f;
    
       *
    
       *   volk_32f_s32f_convert_16i(out, increasing, scale, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %i\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
    
      #define INCLUDED_volk_32f_s32f_convert_16i_u_H
    
      #include <inttypes.h>
    
      #include <limits.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
    
                                                          const float* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m256 vScalar = _mm256_set1_ps(scalar);
    
          __m256 inputVal1, inputVal2;
    
          __m256i intInputVal1, intInputVal2;
    
          __m256 ret1, ret2;
    
      2
          __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          __m256 vmax_val = _mm256_set1_ps(max_val);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal1 = _mm256_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 8;
    
      16382
              inputVal2 = _mm256_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 8;
    
              // Scale and clip
    
      49146
              ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
    
                                   vmin_val);
    
      49146
              ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
    
                                   vmin_val);
    
      16382
              intInputVal1 = _mm256_cvtps_epi32(ret1);
    
      16382
              intInputVal2 = _mm256_cvtps_epi32(ret2);
    
      16382
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      16382
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
              _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
    
      16382
              outputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.

      30
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.

      30
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      30
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m256 vScalar = _mm256_set1_ps(scalar);
    
          __m256 inputVal, ret;
    
          __m256i intInputVal;
    
          __m128i intInputVal1, intInputVal2;
    
      2
          __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          __m256 vmax_val = _mm256_set1_ps(max_val);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              inputVal = _mm256_loadu_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
              // Scale and clip
    
      98298
              ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
    
                                  vmin_val);
    
      32766
              intInputVal = _mm256_cvtps_epi32(ret);
    
      32766
              intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
    
      32766
              intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
    
      32766
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      14
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
    
                                                          const float* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m128 vScalar = _mm_set_ps1(scalar);
    
          __m128 inputVal1, inputVal2;
    
          __m128i intInputVal1, intInputVal2;
    
          __m128 ret1, ret2;
    
      2
          __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          __m128 vmax_val = _mm_set_ps1(max_val);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              inputVal1 = _mm_loadu_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 4;
    
      32766
              inputVal2 = _mm_loadu_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 4;
    
              // Scale and clip
    
      98298
              ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
      98298
              ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
      32766
              intInputVal1 = _mm_cvtps_epi32(ret1);
    
      32766
              intInputVal2 = _mm_cvtps_epi32(ret2);
    
      32766
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      14
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m128 vScalar = _mm_set_ps1(scalar);
    
          __m128 ret;
    
      2
          __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          __m128 vmax_val = _mm_set_ps1(max_val);
    
          __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              ret = _mm_loadu_ps(inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
              // Scale and clip
    
      196602
              ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    
              _mm_store_ps(outputFloatBuffer, ret);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      6
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
    
                                                           const float* inputVector,
    
                                                           const float scalar,
    
                                                           unsigned int num_points)
    
      {
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          const float* inputVectorPtr = inputVector;
    
      2
          unsigned int number = 0;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              r = *inputVectorPtr++ * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.

      262142
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.

      262142
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      262142
              *outputVectorPtr++ = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
    
      #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
    
      #define INCLUDED_volk_32f_s32f_convert_16i_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
    
                                                          const float* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m256 vScalar = _mm256_set1_ps(scalar);
    
          __m256 inputVal1, inputVal2;
    
          __m256i intInputVal1, intInputVal2;
    
          __m256 ret1, ret2;
    
      2
          __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          __m256 vmax_val = _mm256_set1_ps(max_val);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal1 = _mm256_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 8;
    
      16382
              inputVal2 = _mm256_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 8;
    
              // Scale and clip
    
      49146
              ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
    
                                   vmin_val);
    
      49146
              ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
    
                                   vmin_val);
    
      16382
              intInputVal1 = _mm256_cvtps_epi32(ret1);
    
      16382
              intInputVal2 = _mm256_cvtps_epi32(ret2);
    
      16382
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      16382
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
              _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
    
      16382
              outputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.

      30
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.

      30
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      30
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m256 vScalar = _mm256_set1_ps(scalar);
    
          __m256 inputVal, ret;
    
          __m256i intInputVal;
    
          __m128i intInputVal1, intInputVal2;
    
      2
          __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          __m256 vmax_val = _mm256_set1_ps(max_val);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              inputVal = _mm256_load_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
              // Scale and clip
    
      98298
              ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
    
                                  vmin_val);
    
      32766
              intInputVal = _mm256_cvtps_epi32(ret);
    
      32766
              intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
    
      32766
              intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
    
      32766
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      14
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
    
                                                          const float* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m128 vScalar = _mm_set_ps1(scalar);
    
          __m128 inputVal1, inputVal2;
    
          __m128i intInputVal1, intInputVal2;
    
          __m128 ret1, ret2;
    
      2
          __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          __m128 vmax_val = _mm_set_ps1(max_val);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              inputVal1 = _mm_load_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 4;
    
      32766
              inputVal2 = _mm_load_ps(inputVectorPtr);
    
      32766
              inputVectorPtr += 4;
    
              // Scale and clip
    
      98298
              ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
      98298
              ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
      32766
              intInputVal1 = _mm_cvtps_epi32(ret1);
    
      32766
              intInputVal2 = _mm_cvtps_epi32(ret2);
    
      32766
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14 times.

      14
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      14
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
      2
          __m128 vScalar = _mm_set_ps1(scalar);
    
          __m128 ret;
    
      2
          __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          __m128 vmax_val = _mm_set_ps1(max_val);
    
          __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              ret = _mm_load_ps(inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
              // Scale and clip
    
      196602
              ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    
              _mm_store_ps(outputFloatBuffer, ret);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    
      65534
              *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              r = inputVector[number] * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
              if (r > max_val)
    
      ✗
                  r = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
              else if (r < min_val)
    
      ✗
                  r = min_val;
    
      6
              outputVector[number] = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
    
                                                             const float* inputVector,
    
                                                             const float scalar,
    
                                                             unsigned int num_points)
    
      {
    
      2
          int16_t* outputVectorPtr = outputVector;
    
      2
          const float* inputVectorPtr = inputVector;
    
      2
          unsigned int number = 0;
    
      2
          float min_val = SHRT_MIN;
    
      2
          float max_val = SHRT_MAX;
    
          float r;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              r = *inputVectorPtr++ * scalar;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.

      262142
              if (r < min_val)
    
      ✗
                  r = min_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262142 times.

      262142
              else if (r > max_val)
    
      ✗
                  r = max_val;
    
      262142
              *outputVectorPtr++ = (int16_t)rintf(r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_s32f_convert_16i
12			*
13			* \b Overview
14			*
15			* Converts a floating point number to a 16-bit short after applying a
16			* scaling factor.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const
21			* float scalar, unsigned int num_points) \endcode
22			*
23			* \b Inputs
24			* \li inputVector: the input vector of floats.
25			* \li scalar: The value multiplied against each point in the input buffer.
26			* \li num_points: The number of data points.
27			*
28			* \b Outputs
29			* \li outputVector: The output vector.
30			*
31			* \b Example
32			* Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
33			* delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing =
34			* (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out =
35			* (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
36			*
37			* for(unsigned int ii = 0; ii < N; ++ii){
38			* increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
39			* }
40			*
41			* // Normalize by the smallest delta (0.2 in this example)
42			* float scale = 5.f;
43			*
44			* volk_32f_s32f_convert_16i(out, increasing, scale, N);
45			*
46			* for(unsigned int ii = 0; ii < N; ++ii){
47			* printf("out[%u] = %i\n", ii, out[ii]);
48			* }
49			*
50			* volk_free(increasing);
51			* volk_free(out);
52			* \endcode
53			*/
54
55			#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
56			#define INCLUDED_volk_32f_s32f_convert_16i_u_H
57
58			#include <inttypes.h>
59			#include <limits.h>
60			#include <stdio.h>
61
62			#ifdef LV_HAVE_AVX2
63			#include <immintrin.h>
64
65		2	static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
66			const float* inputVector,
67			const float scalar,
68			unsigned int num_points)
69			{
70		2	unsigned int number = 0;
71
72		2	const unsigned int sixteenthPoints = num_points / 16;
73
74		2	const float* inputVectorPtr = (const float*)inputVector;
75		2	int16_t* outputVectorPtr = outputVector;
76
77		2	float min_val = SHRT_MIN;
78		2	float max_val = SHRT_MAX;
79			float r;
80
81		2	__m256 vScalar = _mm256_set1_ps(scalar);
82			__m256 inputVal1, inputVal2;
83			__m256i intInputVal1, intInputVal2;
84			__m256 ret1, ret2;
85		2	__m256 vmin_val = _mm256_set1_ps(min_val);
86		2	__m256 vmax_val = _mm256_set1_ps(max_val);
87
88	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
89		16382	inputVal1 = _mm256_loadu_ps(inputVectorPtr);
90		16382	inputVectorPtr += 8;
91		16382	inputVal2 = _mm256_loadu_ps(inputVectorPtr);
92		16382	inputVectorPtr += 8;
93
94			// Scale and clip
95		49146	ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
96			vmin_val);
97		49146	ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
98			vmin_val);
99
100		16382	intInputVal1 = _mm256_cvtps_epi32(ret1);
101		16382	intInputVal2 = _mm256_cvtps_epi32(ret2);
102
103		16382	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
104		16382	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
105
106			_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
107		16382	outputVectorPtr += 16;
108			}
109
110		2	number = sixteenthPoints * 16;
111	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
112		30	r = inputVector[number] * scalar;
113	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 30 times.	30	if (r > max_val)
114		✗	r = max_val;
115	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 30 times.	30	else if (r < min_val)
116		✗	r = min_val;
117		30	outputVector[number] = (int16_t)rintf(r);
118			}
119		2	}
120			#endif /* LV_HAVE_AVX2 */
121
122
123			#ifdef LV_HAVE_AVX
124			#include <immintrin.h>
125
126		2	static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
127			const float* inputVector,
128			const float scalar,
129			unsigned int num_points)
130			{
131		2	unsigned int number = 0;
132
133		2	const unsigned int eighthPoints = num_points / 8;
134
135		2	const float* inputVectorPtr = (const float*)inputVector;
136		2	int16_t* outputVectorPtr = outputVector;
137
138		2	float min_val = SHRT_MIN;
139		2	float max_val = SHRT_MAX;
140			float r;
141
142		2	__m256 vScalar = _mm256_set1_ps(scalar);
143			__m256 inputVal, ret;
144			__m256i intInputVal;
145			__m128i intInputVal1, intInputVal2;
146		2	__m256 vmin_val = _mm256_set1_ps(min_val);
147		2	__m256 vmax_val = _mm256_set1_ps(max_val);
148
149	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
150		32766	inputVal = _mm256_loadu_ps(inputVectorPtr);
151		32766	inputVectorPtr += 8;
152
153			// Scale and clip
154		98298	ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
155			vmin_val);
156
157		32766	intInputVal = _mm256_cvtps_epi32(ret);
158
159		32766	intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
160		32766	intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
161
162		32766	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
163
164			_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
165		32766	outputVectorPtr += 8;
166			}
167
168		2	number = eighthPoints * 8;
169	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
170		14	r = inputVector[number] * scalar;
171	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (r > max_val)
172		✗	r = max_val;
173	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	else if (r < min_val)
174		✗	r = min_val;
175		14	outputVector[number] = (int16_t)rintf(r);
176			}
177		2	}
178			#endif /* LV_HAVE_AVX */
179
180
181			#ifdef LV_HAVE_SSE2
182			#include <emmintrin.h>
183
184		2	static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
185			const float* inputVector,
186			const float scalar,
187			unsigned int num_points)
188			{
189		2	unsigned int number = 0;
190
191		2	const unsigned int eighthPoints = num_points / 8;
192
193		2	const float* inputVectorPtr = (const float*)inputVector;
194		2	int16_t* outputVectorPtr = outputVector;
195
196		2	float min_val = SHRT_MIN;
197		2	float max_val = SHRT_MAX;
198			float r;
199
200		2	__m128 vScalar = _mm_set_ps1(scalar);
201			__m128 inputVal1, inputVal2;
202			__m128i intInputVal1, intInputVal2;
203			__m128 ret1, ret2;
204		2	__m128 vmin_val = _mm_set_ps1(min_val);
205		2	__m128 vmax_val = _mm_set_ps1(max_val);
206
207	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
208		32766	inputVal1 = _mm_loadu_ps(inputVectorPtr);
209		32766	inputVectorPtr += 4;
210		32766	inputVal2 = _mm_loadu_ps(inputVectorPtr);
211		32766	inputVectorPtr += 4;
212
213			// Scale and clip
214		98298	ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
215		98298	ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
216
217		32766	intInputVal1 = _mm_cvtps_epi32(ret1);
218		32766	intInputVal2 = _mm_cvtps_epi32(ret2);
219
220		32766	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
221
222			_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
223		32766	outputVectorPtr += 8;
224			}
225
226		2	number = eighthPoints * 8;
227	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
228		14	r = inputVector[number] * scalar;
229	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (r > max_val)
230		✗	r = max_val;
231	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	else if (r < min_val)
232		✗	r = min_val;
233		14	outputVector[number] = (int16_t)rintf(r);
234			}
235		2	}
236			#endif /* LV_HAVE_SSE2 */
237
238
239			#ifdef LV_HAVE_SSE
240			#include <xmmintrin.h>
241
242		2	static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
243			const float* inputVector,
244			const float scalar,
245			unsigned int num_points)
246			{
247		2	unsigned int number = 0;
248
249		2	const unsigned int quarterPoints = num_points / 4;
250
251		2	const float* inputVectorPtr = (const float*)inputVector;
252		2	int16_t* outputVectorPtr = outputVector;
253
254		2	float min_val = SHRT_MIN;
255		2	float max_val = SHRT_MAX;
256			float r;
257
258		2	__m128 vScalar = _mm_set_ps1(scalar);
259			__m128 ret;
260		2	__m128 vmin_val = _mm_set_ps1(min_val);
261		2	__m128 vmax_val = _mm_set_ps1(max_val);
262
263			__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
264
265	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
266		65534	ret = _mm_loadu_ps(inputVectorPtr);
267		65534	inputVectorPtr += 4;
268
269			// Scale and clip
270		196602	ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
271
272			_mm_store_ps(outputFloatBuffer, ret);
273		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
274		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
275		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
276		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
277			}
278
279		2	number = quarterPoints * 4;
280	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
281		6	r = inputVector[number] * scalar;
282	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	if (r > max_val)
283		✗	r = max_val;
284	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	else if (r < min_val)
285		✗	r = min_val;
286		6	outputVector[number] = (int16_t)rintf(r);
287			}
288		2	}
289			#endif /* LV_HAVE_SSE */
290
291
292			#ifdef LV_HAVE_GENERIC
293
294		2	static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
295			const float* inputVector,
296			const float scalar,
297			unsigned int num_points)
298			{
299		2	int16_t* outputVectorPtr = outputVector;
300		2	const float* inputVectorPtr = inputVector;
301		2	unsigned int number = 0;
302		2	float min_val = SHRT_MIN;
303		2	float max_val = SHRT_MAX;
304			float r;
305
306	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
307		262142	r = *inputVectorPtr++ * scalar;
308	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262142 times.	262142	if (r > max_val)
309		✗	r = max_val;
310	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262142 times.	262142	else if (r < min_val)
311		✗	r = min_val;
312		262142	*outputVectorPtr++ = (int16_t)rintf(r);
313			}
314		2	}
315			#endif /* LV_HAVE_GENERIC */
316
317
318			#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
319			#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
320			#define INCLUDED_volk_32f_s32f_convert_16i_a_H
321
322			#include <inttypes.h>
323			#include <math.h>
324			#include <stdio.h>
325			#include <volk/volk_common.h>
326
327			#ifdef LV_HAVE_AVX2
328			#include <immintrin.h>
329
330		2	static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
331			const float* inputVector,
332			const float scalar,
333			unsigned int num_points)
334			{
335		2	unsigned int number = 0;
336
337		2	const unsigned int sixteenthPoints = num_points / 16;
338
339		2	const float* inputVectorPtr = (const float*)inputVector;
340		2	int16_t* outputVectorPtr = outputVector;
341
342		2	float min_val = SHRT_MIN;
343		2	float max_val = SHRT_MAX;
344			float r;
345
346		2	__m256 vScalar = _mm256_set1_ps(scalar);
347			__m256 inputVal1, inputVal2;
348			__m256i intInputVal1, intInputVal2;
349			__m256 ret1, ret2;
350		2	__m256 vmin_val = _mm256_set1_ps(min_val);
351		2	__m256 vmax_val = _mm256_set1_ps(max_val);
352
353	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
354		16382	inputVal1 = _mm256_load_ps(inputVectorPtr);
355		16382	inputVectorPtr += 8;
356		16382	inputVal2 = _mm256_load_ps(inputVectorPtr);
357		16382	inputVectorPtr += 8;
358
359			// Scale and clip
360		49146	ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
361			vmin_val);
362		49146	ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
363			vmin_val);
364
365		16382	intInputVal1 = _mm256_cvtps_epi32(ret1);
366		16382	intInputVal2 = _mm256_cvtps_epi32(ret2);
367
368		16382	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
369		16382	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
370
371			_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
372		16382	outputVectorPtr += 16;
373			}
374
375		2	number = sixteenthPoints * 16;
376	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
377		30	r = inputVector[number] * scalar;
378	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 30 times.	30	if (r > max_val)
379		✗	r = max_val;
380	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 30 times.	30	else if (r < min_val)
381		✗	r = min_val;
382		30	outputVector[number] = (int16_t)rintf(r);
383			}
384		2	}
385			#endif /* LV_HAVE_AVX2 */
386
387
388			#ifdef LV_HAVE_AVX
389			#include <immintrin.h>
390
391		2	static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
392			const float* inputVector,
393			const float scalar,
394			unsigned int num_points)
395			{
396		2	unsigned int number = 0;
397
398		2	const unsigned int eighthPoints = num_points / 8;
399
400		2	const float* inputVectorPtr = (const float*)inputVector;
401		2	int16_t* outputVectorPtr = outputVector;
402
403		2	float min_val = SHRT_MIN;
404		2	float max_val = SHRT_MAX;
405			float r;
406
407		2	__m256 vScalar = _mm256_set1_ps(scalar);
408			__m256 inputVal, ret;
409			__m256i intInputVal;
410			__m128i intInputVal1, intInputVal2;
411		2	__m256 vmin_val = _mm256_set1_ps(min_val);
412		2	__m256 vmax_val = _mm256_set1_ps(max_val);
413
414	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
415		32766	inputVal = _mm256_load_ps(inputVectorPtr);
416		32766	inputVectorPtr += 8;
417
418			// Scale and clip
419		98298	ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
420			vmin_val);
421
422		32766	intInputVal = _mm256_cvtps_epi32(ret);
423
424		32766	intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
425		32766	intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
426
427		32766	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
428
429			_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
430		32766	outputVectorPtr += 8;
431			}
432
433		2	number = eighthPoints * 8;
434	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
435		14	r = inputVector[number] * scalar;
436	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (r > max_val)
437		✗	r = max_val;
438	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	else if (r < min_val)
439		✗	r = min_val;
440		14	outputVector[number] = (int16_t)rintf(r);
441			}
442		2	}
443			#endif /* LV_HAVE_AVX */
444
445			#ifdef LV_HAVE_SSE2
446			#include <emmintrin.h>
447
448		2	static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
449			const float* inputVector,
450			const float scalar,
451			unsigned int num_points)
452			{
453		2	unsigned int number = 0;
454
455		2	const unsigned int eighthPoints = num_points / 8;
456
457		2	const float* inputVectorPtr = (const float*)inputVector;
458		2	int16_t* outputVectorPtr = outputVector;
459
460		2	float min_val = SHRT_MIN;
461		2	float max_val = SHRT_MAX;
462			float r;
463
464		2	__m128 vScalar = _mm_set_ps1(scalar);
465			__m128 inputVal1, inputVal2;
466			__m128i intInputVal1, intInputVal2;
467			__m128 ret1, ret2;
468		2	__m128 vmin_val = _mm_set_ps1(min_val);
469		2	__m128 vmax_val = _mm_set_ps1(max_val);
470
471	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
472		32766	inputVal1 = _mm_load_ps(inputVectorPtr);
473		32766	inputVectorPtr += 4;
474		32766	inputVal2 = _mm_load_ps(inputVectorPtr);
475		32766	inputVectorPtr += 4;
476
477			// Scale and clip
478		98298	ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
479		98298	ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
480
481		32766	intInputVal1 = _mm_cvtps_epi32(ret1);
482		32766	intInputVal2 = _mm_cvtps_epi32(ret2);
483
484		32766	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
485
486			_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
487		32766	outputVectorPtr += 8;
488			}
489
490		2	number = eighthPoints * 8;
491	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
492		14	r = inputVector[number] * scalar;
493	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	if (r > max_val)
494		✗	r = max_val;
495	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times.	14	else if (r < min_val)
496		✗	r = min_val;
497		14	outputVector[number] = (int16_t)rintf(r);
498			}
499		2	}
500			#endif /* LV_HAVE_SSE2 */
501
502
503			#ifdef LV_HAVE_SSE
504			#include <xmmintrin.h>
505
506		2	static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
507			const float* inputVector,
508			const float scalar,
509			unsigned int num_points)
510			{
511		2	unsigned int number = 0;
512
513		2	const unsigned int quarterPoints = num_points / 4;
514
515		2	const float* inputVectorPtr = (const float*)inputVector;
516		2	int16_t* outputVectorPtr = outputVector;
517
518		2	float min_val = SHRT_MIN;
519		2	float max_val = SHRT_MAX;
520			float r;
521
522		2	__m128 vScalar = _mm_set_ps1(scalar);
523			__m128 ret;
524		2	__m128 vmin_val = _mm_set_ps1(min_val);
525		2	__m128 vmax_val = _mm_set_ps1(max_val);
526
527			__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
528
529	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
530		65534	ret = _mm_load_ps(inputVectorPtr);
531		65534	inputVectorPtr += 4;
532
533			// Scale and clip
534		196602	ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
535
536			_mm_store_ps(outputFloatBuffer, ret);
537		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
538		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
539		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
540		65534	*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
541			}
542
543		2	number = quarterPoints * 4;
544	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
545		6	r = inputVector[number] * scalar;
546	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	if (r > max_val)
547		✗	r = max_val;
548	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	else if (r < min_val)
549		✗	r = min_val;
550		6	outputVector[number] = (int16_t)rintf(r);
551			}
552		2	}
553			#endif /* LV_HAVE_SSE */
554
555
556			#ifdef LV_HAVE_GENERIC
557
558		2	static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
559			const float* inputVector,
560			const float scalar,
561			unsigned int num_points)
562			{
563		2	int16_t* outputVectorPtr = outputVector;
564		2	const float* inputVectorPtr = inputVector;
565		2	unsigned int number = 0;
566		2	float min_val = SHRT_MIN;
567		2	float max_val = SHRT_MAX;
568			float r;
569
570	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
571		262142	r = *inputVectorPtr++ * scalar;
572	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262142 times.	262142	if (r < min_val)
573		✗	r = min_val;
574	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 262142 times.	262142	else if (r > max_val)
575		✗	r = max_val;
576		262142	*outputVectorPtr++ = (int16_t)rintf(r);
577			}
578		2	}
579			#endif /* LV_HAVE_GENERIC */
580
581			#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
582