GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_convert_16ic.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	117	127	92.1%
Functions:	5	5	100.0%
Branches:	28	38	73.7%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2016 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_convert_16ic
    
       *
    
       * \b Overview
    
       *
    
       * Converts a complex vector whose components are 32-bit floats into
    
       * a complex vector whose components are 16-bit integers.
    
       * Values are saturated to the limit values of the output data type.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
    
       * unsigned int num_points); \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector:  The complex 32-bit float input data buffer.
    
       * \li num_points:   The number of complex data values to be converted.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The complex 16-bit integer output data buffer.
    
       *
    
       */
    
      #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
    
      #define INCLUDED_volk_32fc_convert_16ic_a_H
    
      #include "volk/volk_complex.h"
    
      #include <limits.h>
    
      #include <math.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
    
                                                       const lv_32fc_t* inputVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          const unsigned int avx_iters = num_points / 8;
    
      2
          float* inputVectorPtr = (float*)inputVector;
    
      2
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          float aux;
    
      2
          const float min_val = (float)SHRT_MIN;
    
      2
          const float max_val = (float)SHRT_MAX;
    
          __m256 inputVal1, inputVal2;
    
          __m256i intInputVal1, intInputVal2;
    
          __m256 ret1, ret2;
    
      2
          const __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          const __m256 vmax_val = _mm256_set1_ps(max_val);
    
          unsigned int i;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (i = 0; i < avx_iters; i++) {
    
      32766
              inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
      32766
              inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
      32766
              __VOLK_PREFETCH(inputVectorPtr + 16);
    
              // Clip
    
      65532
              ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
    
      65532
              ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
    
      32766
              intInputVal1 = _mm256_cvtps_epi32(ret1);
    
      32766
              intInputVal2 = _mm256_cvtps_epi32(ret2);
    
      32766
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      32766
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
    
              _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 16;
    
          }
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.

      30
          for (i = avx_iters * 16; i < num_points * 2; i++) {
    
      28
              aux = *inputVectorPtr++;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.

      28
              if (aux > max_val)
    
      ✗
                  aux = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.

      28
              else if (aux < min_val)
    
      ✗
                  aux = min_val;
    
      28
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
    
                                                       const lv_32fc_t* inputVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          const unsigned int sse_iters = num_points / 4;
    
      2
          float* inputVectorPtr = (float*)inputVector;
    
      2
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          float aux;
    
      2
          const float min_val = (float)SHRT_MIN;
    
      2
          const float max_val = (float)SHRT_MAX;
    
          __m128 inputVal1, inputVal2;
    
          __m128i intInputVal1, intInputVal2;
    
          __m128 ret1, ret2;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
          unsigned int i;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (i = 0; i < sse_iters; i++) {
    
      65534
              inputVal1 = _mm_load_ps((float*)inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      65534
              inputVal2 = _mm_load_ps((float*)inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      65534
              __VOLK_PREFETCH(inputVectorPtr + 8);
    
              // Clip
    
      131068
              ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
    
      131068
              ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
    
      65534
              intInputVal1 = _mm_cvtps_epi32(ret1);
    
      65534
              intInputVal2 = _mm_cvtps_epi32(ret2);
    
      65534
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      65534
              outputVectorPtr += 8;
    
          }
    
        2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.

      14
          for (i = sse_iters * 8; i < num_points * 2; i++) {
    
      12
              aux = *inputVectorPtr++;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
              if (aux > max_val)
    
      ✗
                  aux = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
              else if (aux < min_val)
    
      ✗
                  aux = min_val;
    
      12
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #if LV_HAVE_NEONV7
    
      #include <arm_neon.h>
    
      #define VCVTRQ_S32_F32(result, value)                                       \
    
          __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
    
          __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
    
          __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
    
          __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
    
      static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
    
                                                     const lv_32fc_t* inputVector,
    
                                                     unsigned int num_points)
    
      {
    
          const unsigned int neon_iters = num_points / 4;
    
          float32_t* inputVectorPtr = (float32_t*)inputVector;
    
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          const float min_val_f = (float)SHRT_MIN;
    
          const float max_val_f = (float)SHRT_MAX;
    
          float32_t aux;
    
          unsigned int i;
    
          const float32x4_t min_val = vmovq_n_f32(min_val_f);
    
          const float32x4_t max_val = vmovq_n_f32(max_val_f);
    
          float32x4_t ret1, ret2, a, b;
    
          int32x4_t toint_a = { 0, 0, 0, 0 };
    
          int32x4_t toint_b = { 0, 0, 0, 0 };
    
          int16x4_t intInputVal1, intInputVal2;
    
          int16x8_t res;
    
          for (i = 0; i < neon_iters; i++) {
    
              a = vld1q_f32((const float32_t*)(inputVectorPtr));
    
              inputVectorPtr += 4;
    
              b = vld1q_f32((const float32_t*)(inputVectorPtr));
    
              inputVectorPtr += 4;
    
              __VOLK_PREFETCH(inputVectorPtr + 8);
    
              ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
    
              ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
    
              // vcvtr takes into account the current rounding mode (as does rintf)
    
              VCVTRQ_S32_F32(toint_a, ret1);
    
              VCVTRQ_S32_F32(toint_b, ret2);
    
              intInputVal1 = vqmovn_s32(toint_a);
    
              intInputVal2 = vqmovn_s32(toint_b);
    
              res = vcombine_s16(intInputVal1, intInputVal2);
    
              vst1q_s16((int16_t*)outputVectorPtr, res);
    
              outputVectorPtr += 8;
    
          }
    
          for (i = neon_iters * 8; i < num_points * 2; i++) {
    
              aux = *inputVectorPtr++;
    
              if (aux > max_val_f)
    
                  aux = max_val_f;
    
              else if (aux < min_val_f)
    
                  aux = min_val_f;
    
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      }
    
      #undef VCVTRQ_S32_F32
    
      #endif /* LV_HAVE_NEONV7 */
    
      #if LV_HAVE_NEONV8
    
      #include <arm_neon.h>
    
      static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
    
                                                       const lv_32fc_t* inputVector,
    
                                                       unsigned int num_points)
    
      {
    
          const unsigned int neon_iters = num_points / 4;
    
          float32_t* inputVectorPtr = (float32_t*)inputVector;
    
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          const float min_val_f = (float)SHRT_MIN;
    
          const float max_val_f = (float)SHRT_MAX;
    
          float32_t aux;
    
          unsigned int i;
    
          const float32x4_t min_val = vmovq_n_f32(min_val_f);
    
          const float32x4_t max_val = vmovq_n_f32(max_val_f);
    
          float32x4_t ret1, ret2, a, b;
    
          int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
    
          int16x4_t intInputVal1, intInputVal2;
    
          int16x8_t res;
    
          for (i = 0; i < neon_iters; i++) {
    
              a = vld1q_f32((const float32_t*)(inputVectorPtr));
    
              inputVectorPtr += 4;
    
              b = vld1q_f32((const float32_t*)(inputVectorPtr));
    
              inputVectorPtr += 4;
    
              __VOLK_PREFETCH(inputVectorPtr + 8);
    
              ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
    
              ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
    
              // vrndiq takes into account the current rounding mode (as does rintf)
    
              toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
    
              toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
    
              intInputVal1 = vqmovn_s32(toint_a);
    
              intInputVal2 = vqmovn_s32(toint_b);
    
              res = vcombine_s16(intInputVal1, intInputVal2);
    
              vst1q_s16((int16_t*)outputVectorPtr, res);
    
              outputVectorPtr += 8;
    
          }
    
          for (i = neon_iters * 8; i < num_points * 2; i++) {
    
              aux = *inputVectorPtr++;
    
              if (aux > max_val_f)
    
                  aux = max_val_f;
    
              else if (aux < min_val_f)
    
                  aux = min_val_f;
    
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEONV8 */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
    
                                                        const lv_32fc_t* inputVector,
    
                                                        unsigned int num_points)
    
      {
    
      2
          float* inputVectorPtr = (float*)inputVector;
    
      2
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
      2
          const float min_val = (float)SHRT_MIN;
    
      2
          const float max_val = (float)SHRT_MAX;
    
          float aux;
    
          unsigned int i;
    
        2/2✓ Branch 0 taken 524284 times.
✓ Branch 1 taken 2 times.

      524286
          for (i = 0; i < num_points * 2; i++) {
    
      524284
              aux = *inputVectorPtr++;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.

      524284
              if (aux > max_val)
    
      ✗
                  aux = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 524284 times.

      524284
              else if (aux < min_val)
    
      ✗
                  aux = min_val;
    
      524284
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
    
      #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
    
      #define INCLUDED_volk_32fc_convert_16ic_u_H
    
      #include "volk/volk_complex.h"
    
      #include <limits.h>
    
      #include <math.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
    
                                                       const lv_32fc_t* inputVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          const unsigned int avx_iters = num_points / 8;
    
      2
          float* inputVectorPtr = (float*)inputVector;
    
      2
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          float aux;
    
      2
          const float min_val = (float)SHRT_MIN;
    
      2
          const float max_val = (float)SHRT_MAX;
    
          __m256 inputVal1, inputVal2;
    
          __m256i intInputVal1, intInputVal2;
    
          __m256 ret1, ret2;
    
      2
          const __m256 vmin_val = _mm256_set1_ps(min_val);
    
      2
          const __m256 vmax_val = _mm256_set1_ps(max_val);
    
          unsigned int i;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (i = 0; i < avx_iters; i++) {
    
      32766
              inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
      32766
              inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
    
      32766
              inputVectorPtr += 8;
    
      32766
              __VOLK_PREFETCH(inputVectorPtr + 16);
    
              // Clip
    
      65532
              ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
    
      65532
              ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
    
      32766
              intInputVal1 = _mm256_cvtps_epi32(ret1);
    
      32766
              intInputVal2 = _mm256_cvtps_epi32(ret2);
    
      32766
              intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    
      32766
              intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
    
              _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
    
      32766
              outputVectorPtr += 16;
    
          }
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.

      30
          for (i = avx_iters * 16; i < num_points * 2; i++) {
    
      28
              aux = *inputVectorPtr++;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.

      28
              if (aux > max_val)
    
      ✗
                  aux = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.

      28
              else if (aux < min_val)
    
      ✗
                  aux = min_val;
    
      28
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      2
      static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
    
                                                       const lv_32fc_t* inputVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          const unsigned int sse_iters = num_points / 4;
    
      2
          float* inputVectorPtr = (float*)inputVector;
    
      2
          int16_t* outputVectorPtr = (int16_t*)outputVector;
    
          float aux;
    
      2
          const float min_val = (float)SHRT_MIN;
    
      2
          const float max_val = (float)SHRT_MAX;
    
          __m128 inputVal1, inputVal2;
    
          __m128i intInputVal1, intInputVal2;
    
          __m128 ret1, ret2;
    
      2
          const __m128 vmin_val = _mm_set_ps1(min_val);
    
      2
          const __m128 vmax_val = _mm_set_ps1(max_val);
    
          unsigned int i;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (i = 0; i < sse_iters; i++) {
    
      65534
              inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      65534
              inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
    
      65534
              inputVectorPtr += 4;
    
      65534
              __VOLK_PREFETCH(inputVectorPtr + 8);
    
              // Clip
    
      131068
              ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
    
      131068
              ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
    
      65534
              intInputVal1 = _mm_cvtps_epi32(ret1);
    
      65534
              intInputVal2 = _mm_cvtps_epi32(ret2);
    
      65534
              intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    
              _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    
      65534
              outputVectorPtr += 8;
    
          }
    
        2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 2 times.

      14
          for (i = sse_iters * 8; i < num_points * 2; i++) {
    
      12
              aux = *inputVectorPtr++;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
              if (aux > max_val)
    
      ✗
                  aux = max_val;
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
              else if (aux < min_val)
    
      ✗
                  aux = min_val;
    
      12
              *outputVectorPtr++ = (int16_t)rintf(aux);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2016 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_convert_16ic
12			*
13			* \b Overview
14			*
15			* Converts a complex vector whose components are 32-bit floats into
16			* a complex vector whose components are 16-bit integers.
17			* Values are saturated to the limit values of the output data type.
18			*
19			* <b>Dispatcher Prototype</b>
20			* \code
21			* void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
22			* unsigned int num_points); \endcode
23			*
24			* \b Inputs
25			* \li inputVector: The complex 32-bit float input data buffer.
26			* \li num_points: The number of complex data values to be converted.
27			*
28			* \b Outputs
29			* \li outputVector: The complex 16-bit integer output data buffer.
30			*
31			*/
32
33			#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34			#define INCLUDED_volk_32fc_convert_16ic_a_H
35
36			#include "volk/volk_complex.h"
37			#include <limits.h>
38			#include <math.h>
39
40			#ifdef LV_HAVE_AVX2
41			#include <immintrin.h>
42
43		2	static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44			const lv_32fc_t* inputVector,
45			unsigned int num_points)
46			{
47		2	const unsigned int avx_iters = num_points / 8;
48
49		2	float* inputVectorPtr = (float*)inputVector;
50		2	int16_t* outputVectorPtr = (int16_t*)outputVector;
51			float aux;
52
53		2	const float min_val = (float)SHRT_MIN;
54		2	const float max_val = (float)SHRT_MAX;
55
56			__m256 inputVal1, inputVal2;
57			__m256i intInputVal1, intInputVal2;
58			__m256 ret1, ret2;
59		2	const __m256 vmin_val = _mm256_set1_ps(min_val);
60		2	const __m256 vmax_val = _mm256_set1_ps(max_val);
61			unsigned int i;
62
63	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (i = 0; i < avx_iters; i++) {
64		32766	inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65		32766	inputVectorPtr += 8;
66		32766	inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67		32766	inputVectorPtr += 8;
68		32766	__VOLK_PREFETCH(inputVectorPtr + 16);
69
70			// Clip
71		65532	ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72		65532	ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73
74		32766	intInputVal1 = _mm256_cvtps_epi32(ret1);
75		32766	intInputVal2 = _mm256_cvtps_epi32(ret2);
76
77		32766	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78		32766	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79
80			_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81		32766	outputVectorPtr += 16;
82			}
83
84	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 2 times.	30	for (i = avx_iters * 16; i < num_points * 2; i++) {
85		28	aux = *inputVectorPtr++;
86	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 28 times.	28	if (aux > max_val)
87		✗	aux = max_val;
88	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 28 times.	28	else if (aux < min_val)
89		✗	aux = min_val;
90		28	*outputVectorPtr++ = (int16_t)rintf(aux);
91			}
92		2	}
93			#endif /* LV_HAVE_AVX2 */
94
95			#ifdef LV_HAVE_SSE2
96			#include <emmintrin.h>
97
98		2	static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
99			const lv_32fc_t* inputVector,
100			unsigned int num_points)
101			{
102		2	const unsigned int sse_iters = num_points / 4;
103
104		2	float* inputVectorPtr = (float*)inputVector;
105		2	int16_t* outputVectorPtr = (int16_t*)outputVector;
106			float aux;
107
108		2	const float min_val = (float)SHRT_MIN;
109		2	const float max_val = (float)SHRT_MAX;
110
111			__m128 inputVal1, inputVal2;
112			__m128i intInputVal1, intInputVal2;
113			__m128 ret1, ret2;
114		2	const __m128 vmin_val = _mm_set_ps1(min_val);
115		2	const __m128 vmax_val = _mm_set_ps1(max_val);
116			unsigned int i;
117
118	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (i = 0; i < sse_iters; i++) {
119		65534	inputVal1 = _mm_load_ps((float*)inputVectorPtr);
120		65534	inputVectorPtr += 4;
121		65534	inputVal2 = _mm_load_ps((float*)inputVectorPtr);
122		65534	inputVectorPtr += 4;
123		65534	__VOLK_PREFETCH(inputVectorPtr + 8);
124
125			// Clip
126		131068	ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
127		131068	ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
128
129		65534	intInputVal1 = _mm_cvtps_epi32(ret1);
130		65534	intInputVal2 = _mm_cvtps_epi32(ret2);
131
132		65534	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
133
134			_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
135		65534	outputVectorPtr += 8;
136			}
137
138	2/2 ✓ Branch 0 taken 12 times. ✓ Branch 1 taken 2 times.	14	for (i = sse_iters * 8; i < num_points * 2; i++) {
139		12	aux = *inputVectorPtr++;
140	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	if (aux > max_val)
141		✗	aux = max_val;
142	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	else if (aux < min_val)
143		✗	aux = min_val;
144		12	*outputVectorPtr++ = (int16_t)rintf(aux);
145			}
146		2	}
147			#endif /* LV_HAVE_SSE2 */
148
149
150			#if LV_HAVE_NEONV7
151			#include <arm_neon.h>
152
153			#define VCVTRQ_S32_F32(result, value) \
154			__VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
155			__VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
156			__VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
157			__VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
158
159			static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
160			const lv_32fc_t* inputVector,
161			unsigned int num_points)
162			{
163
164			const unsigned int neon_iters = num_points / 4;
165
166			float32_t* inputVectorPtr = (float32_t*)inputVector;
167			int16_t* outputVectorPtr = (int16_t*)outputVector;
168
169			const float min_val_f = (float)SHRT_MIN;
170			const float max_val_f = (float)SHRT_MAX;
171			float32_t aux;
172			unsigned int i;
173
174			const float32x4_t min_val = vmovq_n_f32(min_val_f);
175			const float32x4_t max_val = vmovq_n_f32(max_val_f);
176			float32x4_t ret1, ret2, a, b;
177
178			int32x4_t toint_a = { 0, 0, 0, 0 };
179			int32x4_t toint_b = { 0, 0, 0, 0 };
180			int16x4_t intInputVal1, intInputVal2;
181			int16x8_t res;
182
183			for (i = 0; i < neon_iters; i++) {
184			a = vld1q_f32((const float32_t*)(inputVectorPtr));
185			inputVectorPtr += 4;
186			b = vld1q_f32((const float32_t*)(inputVectorPtr));
187			inputVectorPtr += 4;
188			__VOLK_PREFETCH(inputVectorPtr + 8);
189
190			ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
191			ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
192
193			// vcvtr takes into account the current rounding mode (as does rintf)
194			VCVTRQ_S32_F32(toint_a, ret1);
195			VCVTRQ_S32_F32(toint_b, ret2);
196
197			intInputVal1 = vqmovn_s32(toint_a);
198			intInputVal2 = vqmovn_s32(toint_b);
199
200			res = vcombine_s16(intInputVal1, intInputVal2);
201			vst1q_s16((int16_t*)outputVectorPtr, res);
202			outputVectorPtr += 8;
203			}
204
205			for (i = neon_iters * 8; i < num_points * 2; i++) {
206			aux = *inputVectorPtr++;
207			if (aux > max_val_f)
208			aux = max_val_f;
209			else if (aux < min_val_f)
210			aux = min_val_f;
211			*outputVectorPtr++ = (int16_t)rintf(aux);
212			}
213			}
214
215			#undef VCVTRQ_S32_F32
216			#endif /* LV_HAVE_NEONV7 */
217
218			#if LV_HAVE_NEONV8
219			#include <arm_neon.h>
220
221			static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
222			const lv_32fc_t* inputVector,
223			unsigned int num_points)
224			{
225			const unsigned int neon_iters = num_points / 4;
226
227			float32_t* inputVectorPtr = (float32_t*)inputVector;
228			int16_t* outputVectorPtr = (int16_t*)outputVector;
229
230			const float min_val_f = (float)SHRT_MIN;
231			const float max_val_f = (float)SHRT_MAX;
232			float32_t aux;
233			unsigned int i;
234
235			const float32x4_t min_val = vmovq_n_f32(min_val_f);
236			const float32x4_t max_val = vmovq_n_f32(max_val_f);
237			float32x4_t ret1, ret2, a, b;
238
239			int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
240			int16x4_t intInputVal1, intInputVal2;
241			int16x8_t res;
242
243			for (i = 0; i < neon_iters; i++) {
244			a = vld1q_f32((const float32_t*)(inputVectorPtr));
245			inputVectorPtr += 4;
246			b = vld1q_f32((const float32_t*)(inputVectorPtr));
247			inputVectorPtr += 4;
248			__VOLK_PREFETCH(inputVectorPtr + 8);
249
250			ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
251			ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
252
253			// vrndiq takes into account the current rounding mode (as does rintf)
254			toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
255			toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
256
257			intInputVal1 = vqmovn_s32(toint_a);
258			intInputVal2 = vqmovn_s32(toint_b);
259
260			res = vcombine_s16(intInputVal1, intInputVal2);
261			vst1q_s16((int16_t*)outputVectorPtr, res);
262			outputVectorPtr += 8;
263			}
264
265			for (i = neon_iters * 8; i < num_points * 2; i++) {
266			aux = *inputVectorPtr++;
267			if (aux > max_val_f)
268			aux = max_val_f;
269			else if (aux < min_val_f)
270			aux = min_val_f;
271			*outputVectorPtr++ = (int16_t)rintf(aux);
272			}
273			}
274			#endif /* LV_HAVE_NEONV8 */
275
276
277			#ifdef LV_HAVE_GENERIC
278
279		2	static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
280			const lv_32fc_t* inputVector,
281			unsigned int num_points)
282			{
283		2	float* inputVectorPtr = (float*)inputVector;
284		2	int16_t* outputVectorPtr = (int16_t*)outputVector;
285		2	const float min_val = (float)SHRT_MIN;
286		2	const float max_val = (float)SHRT_MAX;
287			float aux;
288			unsigned int i;
289	2/2 ✓ Branch 0 taken 524284 times. ✓ Branch 1 taken 2 times.	524286	for (i = 0; i < num_points * 2; i++) {
290		524284	aux = *inputVectorPtr++;
291	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 524284 times.	524284	if (aux > max_val)
292		✗	aux = max_val;
293	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 524284 times.	524284	else if (aux < min_val)
294		✗	aux = min_val;
295		524284	*outputVectorPtr++ = (int16_t)rintf(aux);
296			}
297		2	}
298			#endif /* LV_HAVE_GENERIC */
299
300			#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
301
302			#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303			#define INCLUDED_volk_32fc_convert_16ic_u_H
304
305			#include "volk/volk_complex.h"
306			#include <limits.h>
307			#include <math.h>
308
309
310			#ifdef LV_HAVE_AVX2
311			#include <immintrin.h>
312
313		2	static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
314			const lv_32fc_t* inputVector,
315			unsigned int num_points)
316			{
317		2	const unsigned int avx_iters = num_points / 8;
318
319		2	float* inputVectorPtr = (float*)inputVector;
320		2	int16_t* outputVectorPtr = (int16_t*)outputVector;
321			float aux;
322
323		2	const float min_val = (float)SHRT_MIN;
324		2	const float max_val = (float)SHRT_MAX;
325
326			__m256 inputVal1, inputVal2;
327			__m256i intInputVal1, intInputVal2;
328			__m256 ret1, ret2;
329		2	const __m256 vmin_val = _mm256_set1_ps(min_val);
330		2	const __m256 vmax_val = _mm256_set1_ps(max_val);
331			unsigned int i;
332
333	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (i = 0; i < avx_iters; i++) {
334		32766	inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
335		32766	inputVectorPtr += 8;
336		32766	inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
337		32766	inputVectorPtr += 8;
338		32766	__VOLK_PREFETCH(inputVectorPtr + 16);
339
340			// Clip
341		65532	ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342		65532	ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
343
344		32766	intInputVal1 = _mm256_cvtps_epi32(ret1);
345		32766	intInputVal2 = _mm256_cvtps_epi32(ret2);
346
347		32766	intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348		32766	intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
349
350			_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351		32766	outputVectorPtr += 16;
352			}
353
354	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 2 times.	30	for (i = avx_iters * 16; i < num_points * 2; i++) {
355		28	aux = *inputVectorPtr++;
356	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 28 times.	28	if (aux > max_val)
357		✗	aux = max_val;
358	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 28 times.	28	else if (aux < min_val)
359		✗	aux = min_val;
360		28	*outputVectorPtr++ = (int16_t)rintf(aux);
361			}
362		2	}
363			#endif /* LV_HAVE_AVX2 */
364
365
366			#ifdef LV_HAVE_SSE2
367			#include <emmintrin.h>
368
369		2	static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
370			const lv_32fc_t* inputVector,
371			unsigned int num_points)
372			{
373		2	const unsigned int sse_iters = num_points / 4;
374
375		2	float* inputVectorPtr = (float*)inputVector;
376		2	int16_t* outputVectorPtr = (int16_t*)outputVector;
377			float aux;
378
379		2	const float min_val = (float)SHRT_MIN;
380		2	const float max_val = (float)SHRT_MAX;
381
382			__m128 inputVal1, inputVal2;
383			__m128i intInputVal1, intInputVal2;
384			__m128 ret1, ret2;
385		2	const __m128 vmin_val = _mm_set_ps1(min_val);
386		2	const __m128 vmax_val = _mm_set_ps1(max_val);
387
388			unsigned int i;
389	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (i = 0; i < sse_iters; i++) {
390		65534	inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
391		65534	inputVectorPtr += 4;
392		65534	inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
393		65534	inputVectorPtr += 4;
394		65534	__VOLK_PREFETCH(inputVectorPtr + 8);
395
396			// Clip
397		131068	ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
398		131068	ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
399
400		65534	intInputVal1 = _mm_cvtps_epi32(ret1);
401		65534	intInputVal2 = _mm_cvtps_epi32(ret2);
402
403		65534	intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
404
405			_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
406		65534	outputVectorPtr += 8;
407			}
408
409	2/2 ✓ Branch 0 taken 12 times. ✓ Branch 1 taken 2 times.	14	for (i = sse_iters * 8; i < num_points * 2; i++) {
410		12	aux = *inputVectorPtr++;
411	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	if (aux > max_val)
412		✗	aux = max_val;
413	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	else if (aux < min_val)
414		✗	aux = min_val;
415		12	*outputVectorPtr++ = (int16_t)rintf(aux);
416			}
417		2	}
418			#endif /* LV_HAVE_SSE2 */
419			#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
420