GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_8i_s32f_convert_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	132	132	100.0%
Functions:	7	7	100.0%
Branches:	20	20	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_8i_s32f_convert_32f
    
       *
    
       * \b Overview
    
       *
    
       * Convert the input vector of 8-bit chars to a vector of floats. The
    
       * floats are then divided by the scalar factor.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
    
       * float scalar, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector: The input vector of 8-bit chars.
    
       * \li scalar: the scaling factor used to divide the results of the conversion.
    
       * \li num_points: The number of values.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector of floats.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_8i_s32f_convert_32f();
    
       *
    
       * volk_free(x);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
    
      #define INCLUDED_volk_8i_s32f_convert_32f_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
    
                                                         const int8_t* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m256 invScalar = _mm256_set1_ps(iScalar);
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
          __m256 ret;
    
          __m128i inputVal128;
    
          __m256i interimVal;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
    
      16382
              interimVal = _mm256_cvtepi8_epi32(inputVal128);
    
      16382
              ret = _mm256_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm256_mul_ps(ret, invScalar);
    
              _mm256_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 8;
    
      16382
              inputVal128 = _mm_srli_si128(inputVal128, 8);
    
      16382
              interimVal = _mm256_cvtepi8_epi32(inputVal128);
    
      16382
              ret = _mm256_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm256_mul_ps(ret, invScalar);
    
              _mm256_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 8;
    
      16382
              inputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              outputVector[number] = (float)(inputVector[number]) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      2
      static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
    
                                                           const int8_t* inputVector,
    
                                                           const float scalar,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m128 invScalar = _mm_set_ps1(iScalar);
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
          __m128 ret;
    
          __m128i inputVal;
    
          __m128i interimVal;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_storeu_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              outputVector[number] = (float)(inputVector[number]) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
    
                                                          const int8_t* inputVector,
    
                                                          const float scalar,
    
                                                          unsigned int num_points)
    
      {
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
      2
          unsigned int number = 0;
    
      2
          const float iScalar = 1.0 / scalar;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
    
      #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
    
      #define INCLUDED_volk_8i_s32f_convert_32f_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
    
                                                         const int8_t* inputVector,
    
                                                         const float scalar,
    
                                                         unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m256 invScalar = _mm256_set1_ps(iScalar);
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
          __m256 ret;
    
          __m128i inputVal128;
    
          __m256i interimVal;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
    
      16382
              interimVal = _mm256_cvtepi8_epi32(inputVal128);
    
      16382
              ret = _mm256_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm256_mul_ps(ret, invScalar);
    
              _mm256_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 8;
    
      16382
              inputVal128 = _mm_srli_si128(inputVal128, 8);
    
      16382
              interimVal = _mm256_cvtepi8_epi32(inputVal128);
    
      16382
              ret = _mm256_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm256_mul_ps(ret, invScalar);
    
              _mm256_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 8;
    
      16382
              inputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              outputVector[number] = (float)(inputVector[number]) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      2
      static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
    
                                                           const int8_t* inputVector,
    
                                                           const float scalar,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m128 invScalar = _mm_set_ps1(iScalar);
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
          __m128 ret;
    
          __m128i inputVal;
    
          __m128i interimVal;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVal = _mm_srli_si128(inputVal, 4);
    
      16382
              interimVal = _mm_cvtepi8_epi32(inputVal);
    
      16382
              ret = _mm_cvtepi32_ps(interimVal);
    
      16382
              ret = _mm_mul_ps(ret, invScalar);
    
              _mm_store_ps(outputVectorPtr, ret);
    
      16382
              outputVectorPtr += 4;
    
      16382
              inputVectorPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              outputVector[number] = (float)(inputVector[number]) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
    
                                                       const int8_t* inputVector,
    
                                                       const float scalar,
    
                                                       unsigned int num_points)
    
      {
    
          float* outputVectorPtr = outputVector;
    
          const int8_t* inputVectorPtr = inputVector;
    
          const float iScalar = 1.0 / scalar;
    
          const float32x4_t qiScalar = vdupq_n_f32(iScalar);
    
          int8x16_t inputVal;
    
          int16x8_t lower;
    
          int16x8_t higher;
    
          float32x4_t outputFloat;
    
          unsigned int number = 0;
    
          const unsigned int sixteenthPoints = num_points / 16;
    
          for (; number < sixteenthPoints; number++) {
    
              inputVal = vld1q_s8(inputVectorPtr);
    
              inputVectorPtr += 16;
    
              lower = vmovl_s8(vget_low_s8(inputVal));
    
              higher = vmovl_s8(vget_high_s8(inputVal));
    
              outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
    
              vst1q_f32(outputVectorPtr, outputFloat);
    
              outputVectorPtr += 4;
    
              outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
    
              vst1q_f32(outputVectorPtr, outputFloat);
    
              outputVectorPtr += 4;
    
              outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
    
              vst1q_f32(outputVectorPtr, outputFloat);
    
              outputVectorPtr += 4;
    
              outputFloat =
    
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
    
              vst1q_f32(outputVectorPtr, outputFloat);
    
              outputVectorPtr += 4;
    
          }
    
          for (number = sixteenthPoints * 16; number < num_points; number++) {
    
              *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
    
                                                            const int8_t* inputVector,
    
                                                            const float scalar,
    
                                                            unsigned int num_points)
    
      {
    
      2
          float* outputVectorPtr = outputVector;
    
      2
          const int8_t* inputVectorPtr = inputVector;
    
      2
          unsigned int number = 0;
    
      2
          const float iScalar = 1.0 / scalar;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
    
                                                      const int8_t* inputVector,
    
                                                      const float scalar,
    
                                                      unsigned int num_points);
    
      2
      static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
    
                                                        const int8_t* inputVector,
    
                                                        const float scalar,
    
                                                        unsigned int num_points)
    
      {
    
      2
          float invscalar = 1.0 / scalar;
    
      2
          volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_8i_s32f_convert_32f
12			*
13			* \b Overview
14			*
15			* Convert the input vector of 8-bit chars to a vector of floats. The
16			* floats are then divided by the scalar factor.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
21			* float scalar, unsigned int num_points) \endcode
22			*
23			* \b Inputs
24			* \li inputVector: The input vector of 8-bit chars.
25			* \li scalar: the scaling factor used to divide the results of the conversion.
26			* \li num_points: The number of values.
27			*
28			* \b Outputs
29			* \li outputVector: The output vector of floats.
30			*
31			* \b Example
32			* \code
33			* int N = 10000;
34			*
35			* volk_8i_s32f_convert_32f();
36			*
37			* volk_free(x);
38			* \endcode
39			*/
40
41			#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42			#define INCLUDED_volk_8i_s32f_convert_32f_u_H
43
44			#include <inttypes.h>
45			#include <stdio.h>
46
47			#ifdef LV_HAVE_AVX2
48			#include <immintrin.h>
49
50		2	static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
51			const int8_t* inputVector,
52			const float scalar,
53			unsigned int num_points)
54			{
55		2	unsigned int number = 0;
56		2	const unsigned int sixteenthPoints = num_points / 16;
57
58		2	float* outputVectorPtr = outputVector;
59		2	const float iScalar = 1.0 / scalar;
60		2	__m256 invScalar = _mm256_set1_ps(iScalar);
61		2	const int8_t* inputVectorPtr = inputVector;
62			__m256 ret;
63			__m128i inputVal128;
64			__m256i interimVal;
65
66	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
67		16382	inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
68
69		16382	interimVal = _mm256_cvtepi8_epi32(inputVal128);
70		16382	ret = _mm256_cvtepi32_ps(interimVal);
71		16382	ret = _mm256_mul_ps(ret, invScalar);
72			_mm256_storeu_ps(outputVectorPtr, ret);
73		16382	outputVectorPtr += 8;
74
75		16382	inputVal128 = _mm_srli_si128(inputVal128, 8);
76		16382	interimVal = _mm256_cvtepi8_epi32(inputVal128);
77		16382	ret = _mm256_cvtepi32_ps(interimVal);
78		16382	ret = _mm256_mul_ps(ret, invScalar);
79			_mm256_storeu_ps(outputVectorPtr, ret);
80		16382	outputVectorPtr += 8;
81
82		16382	inputVectorPtr += 16;
83			}
84
85		2	number = sixteenthPoints * 16;
86	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
87		30	outputVector[number] = (float)(inputVector[number]) * iScalar;
88			}
89		2	}
90			#endif /* LV_HAVE_AVX2 */
91
92
93			#ifdef LV_HAVE_SSE4_1
94			#include <smmintrin.h>
95
96		2	static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
97			const int8_t* inputVector,
98			const float scalar,
99			unsigned int num_points)
100			{
101		2	unsigned int number = 0;
102		2	const unsigned int sixteenthPoints = num_points / 16;
103
104		2	float* outputVectorPtr = outputVector;
105		2	const float iScalar = 1.0 / scalar;
106		2	__m128 invScalar = _mm_set_ps1(iScalar);
107		2	const int8_t* inputVectorPtr = inputVector;
108			__m128 ret;
109			__m128i inputVal;
110			__m128i interimVal;
111
112	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
113		16382	inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
114
115		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
116		16382	ret = _mm_cvtepi32_ps(interimVal);
117		16382	ret = _mm_mul_ps(ret, invScalar);
118			_mm_storeu_ps(outputVectorPtr, ret);
119		16382	outputVectorPtr += 4;
120
121		16382	inputVal = _mm_srli_si128(inputVal, 4);
122		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
123		16382	ret = _mm_cvtepi32_ps(interimVal);
124		16382	ret = _mm_mul_ps(ret, invScalar);
125			_mm_storeu_ps(outputVectorPtr, ret);
126		16382	outputVectorPtr += 4;
127
128		16382	inputVal = _mm_srli_si128(inputVal, 4);
129		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
130		16382	ret = _mm_cvtepi32_ps(interimVal);
131		16382	ret = _mm_mul_ps(ret, invScalar);
132			_mm_storeu_ps(outputVectorPtr, ret);
133		16382	outputVectorPtr += 4;
134
135		16382	inputVal = _mm_srli_si128(inputVal, 4);
136		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
137		16382	ret = _mm_cvtepi32_ps(interimVal);
138		16382	ret = _mm_mul_ps(ret, invScalar);
139			_mm_storeu_ps(outputVectorPtr, ret);
140		16382	outputVectorPtr += 4;
141
142		16382	inputVectorPtr += 16;
143			}
144
145		2	number = sixteenthPoints * 16;
146	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
147		30	outputVector[number] = (float)(inputVector[number]) * iScalar;
148			}
149		2	}
150			#endif /* LV_HAVE_SSE4_1 */
151
152			#ifdef LV_HAVE_GENERIC
153
154		2	static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
155			const int8_t* inputVector,
156			const float scalar,
157			unsigned int num_points)
158			{
159		2	float* outputVectorPtr = outputVector;
160		2	const int8_t* inputVectorPtr = inputVector;
161		2	unsigned int number = 0;
162		2	const float iScalar = 1.0 / scalar;
163
164	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
165		262142	*outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
166			}
167		2	}
168			#endif /* LV_HAVE_GENERIC */
169
170
171			#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
172
173			#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174			#define INCLUDED_volk_8i_s32f_convert_32f_a_H
175
176			#include <inttypes.h>
177			#include <stdio.h>
178
179			#ifdef LV_HAVE_AVX2
180			#include <immintrin.h>
181
182		2	static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
183			const int8_t* inputVector,
184			const float scalar,
185			unsigned int num_points)
186			{
187		2	unsigned int number = 0;
188		2	const unsigned int sixteenthPoints = num_points / 16;
189
190		2	float* outputVectorPtr = outputVector;
191		2	const float iScalar = 1.0 / scalar;
192		2	__m256 invScalar = _mm256_set1_ps(iScalar);
193		2	const int8_t* inputVectorPtr = inputVector;
194			__m256 ret;
195			__m128i inputVal128;
196			__m256i interimVal;
197
198	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
199		16382	inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
200
201		16382	interimVal = _mm256_cvtepi8_epi32(inputVal128);
202		16382	ret = _mm256_cvtepi32_ps(interimVal);
203		16382	ret = _mm256_mul_ps(ret, invScalar);
204			_mm256_store_ps(outputVectorPtr, ret);
205		16382	outputVectorPtr += 8;
206
207		16382	inputVal128 = _mm_srli_si128(inputVal128, 8);
208		16382	interimVal = _mm256_cvtepi8_epi32(inputVal128);
209		16382	ret = _mm256_cvtepi32_ps(interimVal);
210		16382	ret = _mm256_mul_ps(ret, invScalar);
211			_mm256_store_ps(outputVectorPtr, ret);
212		16382	outputVectorPtr += 8;
213
214		16382	inputVectorPtr += 16;
215			}
216
217		2	number = sixteenthPoints * 16;
218	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
219		30	outputVector[number] = (float)(inputVector[number]) * iScalar;
220			}
221		2	}
222			#endif /* LV_HAVE_AVX2 */
223
224			#ifdef LV_HAVE_SSE4_1
225			#include <smmintrin.h>
226
227		2	static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
228			const int8_t* inputVector,
229			const float scalar,
230			unsigned int num_points)
231			{
232		2	unsigned int number = 0;
233		2	const unsigned int sixteenthPoints = num_points / 16;
234
235		2	float* outputVectorPtr = outputVector;
236		2	const float iScalar = 1.0 / scalar;
237		2	__m128 invScalar = _mm_set_ps1(iScalar);
238		2	const int8_t* inputVectorPtr = inputVector;
239			__m128 ret;
240			__m128i inputVal;
241			__m128i interimVal;
242
243	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
244		16382	inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
245
246		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
247		16382	ret = _mm_cvtepi32_ps(interimVal);
248		16382	ret = _mm_mul_ps(ret, invScalar);
249			_mm_store_ps(outputVectorPtr, ret);
250		16382	outputVectorPtr += 4;
251
252		16382	inputVal = _mm_srli_si128(inputVal, 4);
253		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
254		16382	ret = _mm_cvtepi32_ps(interimVal);
255		16382	ret = _mm_mul_ps(ret, invScalar);
256			_mm_store_ps(outputVectorPtr, ret);
257		16382	outputVectorPtr += 4;
258
259		16382	inputVal = _mm_srli_si128(inputVal, 4);
260		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
261		16382	ret = _mm_cvtepi32_ps(interimVal);
262		16382	ret = _mm_mul_ps(ret, invScalar);
263			_mm_store_ps(outputVectorPtr, ret);
264		16382	outputVectorPtr += 4;
265
266		16382	inputVal = _mm_srli_si128(inputVal, 4);
267		16382	interimVal = _mm_cvtepi8_epi32(inputVal);
268		16382	ret = _mm_cvtepi32_ps(interimVal);
269		16382	ret = _mm_mul_ps(ret, invScalar);
270			_mm_store_ps(outputVectorPtr, ret);
271		16382	outputVectorPtr += 4;
272
273		16382	inputVectorPtr += 16;
274			}
275
276		2	number = sixteenthPoints * 16;
277	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
278		30	outputVector[number] = (float)(inputVector[number]) * iScalar;
279			}
280		2	}
281			#endif /* LV_HAVE_SSE4_1 */
282
283			#ifdef LV_HAVE_NEON
284			#include <arm_neon.h>
285
286			static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
287			const int8_t* inputVector,
288			const float scalar,
289			unsigned int num_points)
290			{
291			float* outputVectorPtr = outputVector;
292			const int8_t* inputVectorPtr = inputVector;
293
294			const float iScalar = 1.0 / scalar;
295			const float32x4_t qiScalar = vdupq_n_f32(iScalar);
296
297			int8x16_t inputVal;
298
299			int16x8_t lower;
300			int16x8_t higher;
301
302			float32x4_t outputFloat;
303
304			unsigned int number = 0;
305			const unsigned int sixteenthPoints = num_points / 16;
306			for (; number < sixteenthPoints; number++) {
307			inputVal = vld1q_s8(inputVectorPtr);
308			inputVectorPtr += 16;
309
310			lower = vmovl_s8(vget_low_s8(inputVal));
311			higher = vmovl_s8(vget_high_s8(inputVal));
312
313			outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
314			vst1q_f32(outputVectorPtr, outputFloat);
315			outputVectorPtr += 4;
316
317			outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
318			vst1q_f32(outputVectorPtr, outputFloat);
319			outputVectorPtr += 4;
320
321			outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
322			vst1q_f32(outputVectorPtr, outputFloat);
323			outputVectorPtr += 4;
324
325			outputFloat =
326			vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
327			vst1q_f32(outputVectorPtr, outputFloat);
328			outputVectorPtr += 4;
329			}
330			for (number = sixteenthPoints * 16; number < num_points; number++) {
331			*outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
332			}
333			}
334
335			#endif /* LV_HAVE_NEON */
336
337			#ifdef LV_HAVE_GENERIC
338
339		2	static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
340			const int8_t* inputVector,
341			const float scalar,
342			unsigned int num_points)
343			{
344		2	float* outputVectorPtr = outputVector;
345		2	const int8_t* inputVectorPtr = inputVector;
346		2	unsigned int number = 0;
347		2	const float iScalar = 1.0 / scalar;
348
349	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
350		262142	*outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
351			}
352		2	}
353			#endif /* LV_HAVE_GENERIC */
354
355
356			#ifdef LV_HAVE_ORC
357			extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
358			const int8_t* inputVector,
359			const float scalar,
360			unsigned int num_points);
361
362		2	static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
363			const int8_t* inputVector,
364			const float scalar,
365			unsigned int num_points)
366			{
367		2	float invscalar = 1.0 / scalar;
368		2	volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
369		2	}
370			#endif /* LV_HAVE_ORC */
371
372
373			#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
374