GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_s32f_convert_8i.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	195	195	100.0%
Functions:	8	8	100.0%
Branches:	34	34	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_s32f_convert_8i
    
       *
    
       * \b Overview
    
       *
    
       * Converts a floating point number to a 8-bit int after applying a
    
       * scaling factor.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
    
       float scalar, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector: the input vector of floats.
    
       * \li scalar: The value multiplied against each point in the input buffer.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector.
    
       *
    
       * \b Example
    
       * Convert floats from [-1,1] to 8-bit integers with a scale of 5 to maintain smallest
    
       delta
    
       *  int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
    
       *   }
    
       *
    
       *   // Normalize by the smallest delta (0.2 in this example)
    
       *   // With float -> 8 bit ints be careful of scaling
    
       *   float scale = 5.1f;
    
       *
    
       *   volk_32f_s32f_convert_8i(out, increasing, scale, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %i\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
    
      #define INCLUDED_volk_32f_s32f_convert_8i_u_H
    
      #include <inttypes.h>
    
      262338
      static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
    
      {
    
      262338
          const float min_val = INT8_MIN;
    
      262338
          const float max_val = INT8_MAX;
    
        2/2✓ Branch 0 taken 79941 times.
✓ Branch 1 taken 182397 times.

      262338
          if (in > max_val) {
    
      79941
              *out = (int8_t)(max_val);
    
        2/2✓ Branch 0 taken 79659 times.
✓ Branch 1 taken 102738 times.

      182397
          } else if (in < min_val) {
    
      79659
              *out = (int8_t)(min_val);
    
          } else {
    
      102738
              *out = (int8_t)(rintf(in));
    
          }
    
      262338
      }
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
    
                                                          const float* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          const float* inputVectorPtr = inputVector;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (unsigned int number = 0; number < num_points; number++) {
    
      262142
              const float r = *inputVectorPtr++ * scalar;
    
      262142
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          const unsigned int thirtysecondPoints = num_points / 32;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          const __m256 vmax_val = _mm256_set1_ps(max_val);
    
      2
          const __m256 vScalar = _mm256_set1_ps(scalar);
    
        2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.

      8192
          for (unsigned int number = 0; number < thirtysecondPoints; number++) {
    
      8190
              __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      24570
              inputVal1 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
      24570
              inputVal2 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
      24570
              inputVal3 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    
      24570
              inputVal4 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
    
      8190
              __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
    
      8190
              __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
    
      8190
              __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
    
      8190
              __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
    
      8190
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      8190
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
      8190
              intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
    
      8190
              intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
    
      8190
              intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
    
      8190
              const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
              _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
    
      8190
              outputVectorPtr += 32;
    
          }
    
        2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.

      64
          for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
    
      62
              float r = inputVector[number] * scalar;
    
      62
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
      2
          const __m128 vScalar = _mm_set_ps1(scalar);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned int number = 0; number < sixteenthPoints; number++) {
    
      16382
              __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
              inputVal1 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
              inputVal2 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
              inputVal3 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    
              inputVal4 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
    
      16382
              __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
    
      16382
              __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
    
      16382
              __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
    
      16382
              __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
    
      16382
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
      16382
              intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
    
      16382
              intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
    
              _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      16382
              outputVectorPtr += 16;
    
          }
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
    
      30
              const float r = inputVector[number] * scalar;
    
      30
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
    
                                                        const float* inputVector,
    
                                                        const float scalar,
    
                                                        unsigned int num_points)
    
      {
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
      2
          const __m128 vScalar = _mm_set_ps1(scalar);
    
          __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (unsigned int number = 0; number < quarterPoints; number++) {
    
      65534
              __m128 ret = _mm_loadu_ps(inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      196602
              ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    
              _mm_store_ps(outputFloatBuffer, ret);
    
        2/2✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.

      327670
              for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
    
      262136
                  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
    
              }
    
          }
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
    
      6
              const float r = inputVector[number] * scalar;
    
      6
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
    
      #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
    
      #define INCLUDED_volk_32f_s32f_convert_8i_a_H
    
      #include <inttypes.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          const unsigned int thirtysecondPoints = num_points / 32;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          const __m256 vmax_val = _mm256_set1_ps(max_val);
    
      2
          const __m256 vScalar = _mm256_set1_ps(scalar);
    
        2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.

      8192
          for (unsigned int number = 0; number < thirtysecondPoints; number++) {
    
      8190
              __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      8190
              __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
    
      8190
              inputVectorPtr += 8;
    
      24570
              inputVal1 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
      24570
              inputVal2 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
      24570
              inputVal3 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    
      32760
              inputVal4 = _mm256_max_ps(
    
                  _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
    
      8190
              __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
    
      8190
              __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
    
      8190
              __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
    
      8190
              __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
    
      8190
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      8190
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
      8190
              intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
    
      8190
              intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
    
      8190
              intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
    
      8190
              __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
    
              _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
    
      8190
              outputVectorPtr += 32;
    
          }
    
        2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.

      64
          for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
    
      62
              const float r = inputVector[number] * scalar;
    
      62
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
    
                                                         const float* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
      2
          const __m128 vScalar = _mm_set_ps1(scalar);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (unsigned int number = 0; number < sixteenthPoints; number++) {
    
      16382
              __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
      16382
              __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
    
      16382
              inputVectorPtr += 4;
    
              inputVal1 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    
              inputVal2 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    
              inputVal3 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    
              inputVal4 =
    
      49146
                  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
    
      16382
              __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
    
      16382
              __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
    
      16382
              __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
    
      16382
              __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
    
      16382
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
      16382
              intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
    
      16382
              intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
    
              _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      16382
              outputVectorPtr += 16;
    
          }
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
    
      30
              const float r = inputVector[number] * scalar;
    
      30
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
    
                                                        const float* inputVector,
    
                                                        const float scalar,
    
                                                        unsigned int num_points)
    
      {
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          const float* inputVectorPtr = (const float*)inputVector;
    
      2
          int8_t* outputVectorPtr = outputVector;
    
      2
          const float min_val = INT8_MIN;
    
      2
          const float max_val = INT8_MAX;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
      2
          const __m128 vScalar = _mm_set_ps1(scalar);
    
          __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (unsigned int number = 0; number < quarterPoints; number++) {
    
      65534
              __m128 ret = _mm_load_ps(inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      196602
              ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    
              _mm_store_ps(outputFloatBuffer, ret);
    
        2/2✓ Branch 0 taken 262136 times.
✓ Branch 1 taken 65534 times.

      327670
              for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
    
      262136
                  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
    
              }
    
          }
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
    
      6
              const float r = inputVector[number] * scalar;
    
      6
              volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_s32f_convert_8i
12			*
13			* \b Overview
14			*
15			* Converts a floating point number to a 8-bit int after applying a
16			* scaling factor.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
21			float scalar, unsigned int num_points)
22			* \endcode
23			*
24			* \b Inputs
25			* \li inputVector: the input vector of floats.
26			* \li scalar: The value multiplied against each point in the input buffer.
27			* \li num_points: The number of data points.
28			*
29			* \b Outputs
30			* \li outputVector: The output vector.
31			*
32			* \b Example
33			* Convert floats from [-1,1] to 8-bit integers with a scale of 5 to maintain smallest
34			delta
35			* int N = 10;
36			* unsigned int alignment = volk_get_alignment();
37			* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38			* int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
39			*
40			* for(unsigned int ii = 0; ii < N; ++ii){
41			* increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
42			* }
43			*
44			* // Normalize by the smallest delta (0.2 in this example)
45			* // With float -> 8 bit ints be careful of scaling
46
47			* float scale = 5.1f;
48			*
49			* volk_32f_s32f_convert_8i(out, increasing, scale, N);
50			*
51			* for(unsigned int ii = 0; ii < N; ++ii){
52			* printf("out[%u] = %i\n", ii, out[ii]);
53			* }
54			*
55			* volk_free(increasing);
56			* volk_free(out);
57			* \endcode
58			*/
59
60			#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61			#define INCLUDED_volk_32f_s32f_convert_8i_u_H
62
63			#include <inttypes.h>
64
65		262338	static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
66			{
67		262338	const float min_val = INT8_MIN;
68		262338	const float max_val = INT8_MAX;
69	2/2 ✓ Branch 0 taken 79941 times. ✓ Branch 1 taken 182397 times.	262338	if (in > max_val) {
70		79941	*out = (int8_t)(max_val);
71	2/2 ✓ Branch 0 taken 79659 times. ✓ Branch 1 taken 102738 times.	182397	} else if (in < min_val) {
72		79659	*out = (int8_t)(min_val);
73			} else {
74		102738	*out = (int8_t)(rintf(in));
75			}
76		262338	}
77
78			#ifdef LV_HAVE_GENERIC
79
80		2	static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
81			const float* inputVector,
82			const float scalar,
83			unsigned int num_points)
84			{
85		2	const float* inputVectorPtr = inputVector;
86
87	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (unsigned int number = 0; number < num_points; number++) {
88		262142	const float r = *inputVectorPtr++ * scalar;
89		262142	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
90			}
91		2	}
92
93			#endif /* LV_HAVE_GENERIC */
94
95
96			#ifdef LV_HAVE_AVX2
97			#include <immintrin.h>
98
99		2	static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
100			const float* inputVector,
101			const float scalar,
102			unsigned int num_points)
103			{
104		2	const unsigned int thirtysecondPoints = num_points / 32;
105
106		2	const float* inputVectorPtr = (const float*)inputVector;
107		2	int8_t* outputVectorPtr = outputVector;
108
109		2	const float min_val = INT8_MIN;
110		2	const float max_val = INT8_MAX;
111		2	const __m256 vmin_val = _mm256_set1_ps(min_val);
112		2	const __m256 vmax_val = _mm256_set1_ps(max_val);
113
114		2	const __m256 vScalar = _mm256_set1_ps(scalar);
115
116	2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times.	8192	for (unsigned int number = 0; number < thirtysecondPoints; number++) {
117		8190	__m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
118		8190	inputVectorPtr += 8;
119		8190	__m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
120		8190	inputVectorPtr += 8;
121		8190	__m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
122		8190	inputVectorPtr += 8;
123		8190	__m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
124		8190	inputVectorPtr += 8;
125
126		24570	inputVal1 = _mm256_max_ps(
127			_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
128		24570	inputVal2 = _mm256_max_ps(
129			_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
130		24570	inputVal3 = _mm256_max_ps(
131			_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
132		24570	inputVal4 = _mm256_max_ps(
133			_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
134
135		8190	__m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
136		8190	__m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
137		8190	__m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
138		8190	__m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
139
140		8190	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
141		8190	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
142		8190	intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
143		8190	intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
144
145		8190	intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
146		8190	const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
147
148			_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
149		8190	outputVectorPtr += 32;
150			}
151
152	2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times.	64	for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
153		62	float r = inputVector[number] * scalar;
154		62	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
155			}
156		2	}
157
158			#endif /* LV_HAVE_AVX2 */
159
160
161			#ifdef LV_HAVE_SSE2
162			#include <emmintrin.h>
163
164		2	static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
165			const float* inputVector,
166			const float scalar,
167			unsigned int num_points)
168			{
169		2	const unsigned int sixteenthPoints = num_points / 16;
170
171		2	const float* inputVectorPtr = (const float*)inputVector;
172		2	int8_t* outputVectorPtr = outputVector;
173
174		2	const float min_val = INT8_MIN;
175		2	const float max_val = INT8_MAX;
176		2	const __m128 vmin_val = _mm_set_ps1(min_val);
177		2	const __m128 vmax_val = _mm_set_ps1(max_val);
178
179		2	const __m128 vScalar = _mm_set_ps1(scalar);
180
181	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned int number = 0; number < sixteenthPoints; number++) {
182		16382	__m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
183		16382	inputVectorPtr += 4;
184		16382	__m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
185		16382	inputVectorPtr += 4;
186		16382	__m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
187		16382	inputVectorPtr += 4;
188		16382	__m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
189		16382	inputVectorPtr += 4;
190
191			inputVal1 =
192		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
193			inputVal2 =
194		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
195			inputVal3 =
196		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
197			inputVal4 =
198		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
199
200		16382	__m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
201		16382	__m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
202		16382	__m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
203		16382	__m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
204
205		16382	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
206		16382	intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
207
208		16382	intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
209
210			_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
211		16382	outputVectorPtr += 16;
212			}
213
214	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
215		30	const float r = inputVector[number] * scalar;
216		30	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
217			}
218		2	}
219
220			#endif /* LV_HAVE_SSE2 */
221
222
223			#ifdef LV_HAVE_SSE
224			#include <xmmintrin.h>
225
226		2	static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
227			const float* inputVector,
228			const float scalar,
229			unsigned int num_points)
230			{
231		2	const unsigned int quarterPoints = num_points / 4;
232
233		2	const float* inputVectorPtr = (const float*)inputVector;
234		2	int8_t* outputVectorPtr = outputVector;
235
236		2	const float min_val = INT8_MIN;
237		2	const float max_val = INT8_MAX;
238		2	const __m128 vmin_val = _mm_set_ps1(min_val);
239		2	const __m128 vmax_val = _mm_set_ps1(max_val);
240
241		2	const __m128 vScalar = _mm_set_ps1(scalar);
242
243			__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
244
245	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (unsigned int number = 0; number < quarterPoints; number++) {
246		65534	__m128 ret = _mm_loadu_ps(inputVectorPtr);
247		65534	inputVectorPtr += 4;
248
249		196602	ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
250
251			_mm_store_ps(outputFloatBuffer, ret);
252	2/2 ✓ Branch 0 taken 262136 times. ✓ Branch 1 taken 65534 times.	327670	for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
253		262136	*outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
254			}
255			}
256
257	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
258		6	const float r = inputVector[number] * scalar;
259		6	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
260			}
261		2	}
262
263			#endif /* LV_HAVE_SSE */
264
265
266			#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
267			#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
268			#define INCLUDED_volk_32f_s32f_convert_8i_a_H
269
270			#include <inttypes.h>
271
272			#ifdef LV_HAVE_AVX2
273			#include <immintrin.h>
274
275		2	static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
276			const float* inputVector,
277			const float scalar,
278			unsigned int num_points)
279			{
280		2	const unsigned int thirtysecondPoints = num_points / 32;
281
282		2	const float* inputVectorPtr = (const float*)inputVector;
283		2	int8_t* outputVectorPtr = outputVector;
284
285		2	const float min_val = INT8_MIN;
286		2	const float max_val = INT8_MAX;
287		2	const __m256 vmin_val = _mm256_set1_ps(min_val);
288		2	const __m256 vmax_val = _mm256_set1_ps(max_val);
289
290		2	const __m256 vScalar = _mm256_set1_ps(scalar);
291
292	2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times.	8192	for (unsigned int number = 0; number < thirtysecondPoints; number++) {
293		8190	__m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
294		8190	inputVectorPtr += 8;
295		8190	__m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
296		8190	inputVectorPtr += 8;
297		8190	__m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
298		8190	inputVectorPtr += 8;
299		8190	__m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
300		8190	inputVectorPtr += 8;
301
302		24570	inputVal1 = _mm256_max_ps(
303			_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
304		24570	inputVal2 = _mm256_max_ps(
305			_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
306		24570	inputVal3 = _mm256_max_ps(
307			_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
308		32760	inputVal4 = _mm256_max_ps(
309			_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
310
311		8190	__m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
312		8190	__m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
313		8190	__m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
314		8190	__m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
315
316		8190	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
317		8190	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
318		8190	intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
319		8190	intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
320
321		8190	intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
322		8190	__m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
323
324			_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
325		8190	outputVectorPtr += 32;
326			}
327
328	2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times.	64	for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
329		62	const float r = inputVector[number] * scalar;
330		62	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
331			}
332		2	}
333
334			#endif /* LV_HAVE_AVX2 */
335
336
337			#ifdef LV_HAVE_SSE2
338			#include <emmintrin.h>
339
340		2	static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
341			const float* inputVector,
342			const float scalar,
343			unsigned int num_points)
344			{
345		2	const unsigned int sixteenthPoints = num_points / 16;
346
347		2	const float* inputVectorPtr = (const float*)inputVector;
348		2	int8_t* outputVectorPtr = outputVector;
349
350		2	const float min_val = INT8_MIN;
351		2	const float max_val = INT8_MAX;
352		2	const __m128 vmin_val = _mm_set_ps1(min_val);
353		2	const __m128 vmax_val = _mm_set_ps1(max_val);
354
355		2	const __m128 vScalar = _mm_set_ps1(scalar);
356
357	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (unsigned int number = 0; number < sixteenthPoints; number++) {
358		16382	__m128 inputVal1 = _mm_load_ps(inputVectorPtr);
359		16382	inputVectorPtr += 4;
360		16382	__m128 inputVal2 = _mm_load_ps(inputVectorPtr);
361		16382	inputVectorPtr += 4;
362		16382	__m128 inputVal3 = _mm_load_ps(inputVectorPtr);
363		16382	inputVectorPtr += 4;
364		16382	__m128 inputVal4 = _mm_load_ps(inputVectorPtr);
365		16382	inputVectorPtr += 4;
366
367			inputVal1 =
368		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
369			inputVal2 =
370		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
371			inputVal3 =
372		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
373			inputVal4 =
374		49146	_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
375
376		16382	__m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
377		16382	__m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
378		16382	__m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
379		16382	__m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
380
381		16382	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
382		16382	intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
383
384		16382	intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
385
386			_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
387		16382	outputVectorPtr += 16;
388			}
389
390	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
391		30	const float r = inputVector[number] * scalar;
392		30	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
393			}
394		2	}
395			#endif /* LV_HAVE_SSE2 */
396
397
398			#ifdef LV_HAVE_SSE
399			#include <xmmintrin.h>
400
401		2	static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
402			const float* inputVector,
403			const float scalar,
404			unsigned int num_points)
405			{
406		2	const unsigned int quarterPoints = num_points / 4;
407
408		2	const float* inputVectorPtr = (const float*)inputVector;
409		2	int8_t* outputVectorPtr = outputVector;
410
411		2	const float min_val = INT8_MIN;
412		2	const float max_val = INT8_MAX;
413		2	const __m128 vmin_val = _mm_set_ps1(min_val);
414		2	const __m128 vmax_val = _mm_set_ps1(max_val);
415
416		2	const __m128 vScalar = _mm_set_ps1(scalar);
417
418			__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
419
420	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (unsigned int number = 0; number < quarterPoints; number++) {
421		65534	__m128 ret = _mm_load_ps(inputVectorPtr);
422		65534	inputVectorPtr += 4;
423
424		196602	ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
425
426			_mm_store_ps(outputFloatBuffer, ret);
427	2/2 ✓ Branch 0 taken 262136 times. ✓ Branch 1 taken 65534 times.	327670	for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
428		262136	*outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
429			}
430			}
431
432	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
433		6	const float r = inputVector[number] * scalar;
434		6	volk_32f_s32f_convert_8i_single(&outputVector[number], r);
435			}
436		2	}
437
438			#endif /* LV_HAVE_SSE */
439
440
441			#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
442