GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	104	104	100.0%
Functions:	5	5	100.0%
Branches:	14	14	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_16ic_s32f_deinterleave_32f_x2
    
       *
    
       * \b Overview
    
       *
    
       * Deinterleaves the complex 16 bit vector into I & Q vector data and
    
       * returns the result as two vectors of floats that have been scaled.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       *  void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const
    
       * lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode
    
       *
    
       * \b Inputs
    
       * \li complexVector: The complex input vector of 16-bit shorts.
    
       * \li scalar: The value to be divided against each sample of the input complex vector.
    
       * \li num_points: The number of complex data values to be deinterleaved.
    
       *
    
       * \b Outputs
    
       * \li iBuffer: The floating point I buffer output data.
    
       * \li qBuffer: The floating point Q buffer output data.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_16ic_s32f_deinterleave_32f_x2();
    
       *
    
       * volk_free(x);
    
       * volk_free(t);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
    
      #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
    
                                                float* qBuffer,
    
                                                const lv_16sc_t* complexVector,
    
                                                const float scalar,
    
                                                unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          uint64_t number = 0;
    
      2
          const uint64_t eighthPoints = num_points / 8;
    
          __m256 cplxValue1, cplxValue2, iValue, qValue;
    
          __m256i cplxValueA, cplxValueB;
    
          __m128i cplxValue128;
    
      2
          __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
    
      2
          int16_t* complexVectorPtr = (int16_t*)complexVector;
    
      2
          __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
    
      32766
              complexVectorPtr += 16;
    
              // cvt
    
      32766
              cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
    
      32766
              cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
    
      32766
              cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
    
      32766
              cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
    
      32766
              cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
    
      32766
              cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
    
      32766
              cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
    
      32766
              cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
    
              // Arrange in i1i2i3i4 format
    
      32766
              iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    
      32766
              iValue = _mm256_permutevar8x32_ps(iValue, idx);
    
              // Arrange in q1q2q3q4 format
    
      32766
              qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    
      32766
              qValue = _mm256_permutevar8x32_ps(qValue, idx);
    
              _mm256_store_ps(iBufferPtr, iValue);
    
              _mm256_store_ps(qBufferPtr, qValue);
    
      32766
              iBufferPtr += 8;
    
      32766
              qBufferPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          complexVectorPtr = (int16_t*)&complexVector[number];
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
      14
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
    
                                               float* qBuffer,
    
                                               const lv_16sc_t* complexVector,
    
                                               const float scalar,
    
                                               unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          uint64_t number = 0;
    
      2
          const uint64_t quarterPoints = num_points / 4;
    
          __m128 cplxValue1, cplxValue2, iValue, qValue;
    
      2
          __m128 invScalar = _mm_set_ps1(1.0 / scalar);
    
      2
          int16_t* complexVectorPtr = (int16_t*)complexVector;
    
          __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              floatBuffer[0] = (float)(complexVectorPtr[0]);
    
      65534
              floatBuffer[1] = (float)(complexVectorPtr[1]);
    
      65534
              floatBuffer[2] = (float)(complexVectorPtr[2]);
    
      65534
              floatBuffer[3] = (float)(complexVectorPtr[3]);
    
      65534
              floatBuffer[4] = (float)(complexVectorPtr[4]);
    
      65534
              floatBuffer[5] = (float)(complexVectorPtr[5]);
    
      65534
              floatBuffer[6] = (float)(complexVectorPtr[6]);
    
      65534
              floatBuffer[7] = (float)(complexVectorPtr[7]);
    
      65534
              cplxValue1 = _mm_load_ps(&floatBuffer[0]);
    
      65534
              cplxValue2 = _mm_load_ps(&floatBuffer[4]);
    
      65534
              complexVectorPtr += 8;
    
      65534
              cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
    
      65534
              cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
    
              // Arrange in i1i2i3i4 format
    
      65534
              iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    
              // Arrange in q1q2q3q4 format
    
      65534
              qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    
              _mm_store_ps(iBufferPtr, iValue);
    
              _mm_store_ps(qBufferPtr, qValue);
    
      65534
              iBufferPtr += 4;
    
      65534
              qBufferPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          complexVectorPtr = (int16_t*)&complexVector[number];
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
      6
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
    
                                                 float* qBuffer,
    
                                                 const lv_16sc_t* complexVector,
    
                                                 const float scalar,
    
                                                 unsigned int num_points)
    
      {
    
      2
          const int16_t* complexVectorPtr = (const int16_t*)complexVector;
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
          unsigned int number;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
      262142
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
    
                                                                 float* qBuffer,
    
                                                                 const lv_16sc_t* complexVector,
    
                                                                 const float scalar,
    
                                                                 unsigned int num_points)
    
      {
    
          const int16_t* complexVectorPtr = (const int16_t*)complexVector;
    
          float* iBufferPtr = iBuffer;
    
          float* qBufferPtr = qBuffer;
    
          unsigned int eighth_points = num_points / 4;
    
          unsigned int number;
    
          float iScalar = 1.f / scalar;
    
          float32x4_t invScalar;
    
          invScalar = vld1q_dup_f32(&iScalar);
    
          int16x4x2_t complexInput_s16;
    
          int32x4x2_t complexInput_s32;
    
          float32x4x2_t complexFloat;
    
          for (number = 0; number < eighth_points; number++) {
    
              complexInput_s16 = vld2_s16(complexVectorPtr);
    
              complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
    
              complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
    
              complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
    
              complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
    
              complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
    
              complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
    
              vst1q_f32(iBufferPtr, complexFloat.val[0]);
    
              vst1q_f32(qBufferPtr, complexFloat.val[1]);
    
              complexVectorPtr += 8;
    
              iBufferPtr += 4;
    
              qBufferPtr += 4;
    
          }
    
          for (number = eighth_points * 4; number < num_points; number++) {
    
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
    
                                                                float* qBuffer,
    
                                                                const lv_16sc_t* complexVector,
    
                                                                const float scalar,
    
                                                                unsigned int num_points);
    
      static inline void
    
      2
      volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
    
                                               float* qBuffer,
    
                                               const lv_16sc_t* complexVector,
    
                                               const float scalar,
    
                                               unsigned int num_points)
    
      {
    
      2
          volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
    
              iBuffer, qBuffer, complexVector, scalar, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
    
      #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
    
      #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
    
                                                float* qBuffer,
    
                                                const lv_16sc_t* complexVector,
    
                                                const float scalar,
    
                                                unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          uint64_t number = 0;
    
      2
          const uint64_t eighthPoints = num_points / 8;
    
          __m256 cplxValue1, cplxValue2, iValue, qValue;
    
          __m256i cplxValueA, cplxValueB;
    
          __m128i cplxValue128;
    
      2
          __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
    
      2
          int16_t* complexVectorPtr = (int16_t*)complexVector;
    
      2
          __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
    
      32766
              complexVectorPtr += 16;
    
              // cvt
    
      32766
              cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
    
      32766
              cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
    
      32766
              cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
    
      32766
              cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
    
      32766
              cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
    
      32766
              cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
    
      32766
              cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
    
      32766
              cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
    
              // Arrange in i1i2i3i4 format
    
      32766
              iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    
      32766
              iValue = _mm256_permutevar8x32_ps(iValue, idx);
    
              // Arrange in q1q2q3q4 format
    
      32766
              qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    
      32766
              qValue = _mm256_permutevar8x32_ps(qValue, idx);
    
              _mm256_storeu_ps(iBufferPtr, iValue);
    
              _mm256_storeu_ps(qBufferPtr, qValue);
    
      32766
              iBufferPtr += 8;
    
      32766
              qBufferPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          complexVectorPtr = (int16_t*)&complexVector[number];
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
      14
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_16ic_s32f_deinterleave_32f_x2
12			*
13			* \b Overview
14			*
15			* Deinterleaves the complex 16 bit vector into I & Q vector data and
16			* returns the result as two vectors of floats that have been scaled.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const
21			* lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode
22			*
23			* \b Inputs
24			* \li complexVector: The complex input vector of 16-bit shorts.
25			* \li scalar: The value to be divided against each sample of the input complex vector.
26			* \li num_points: The number of complex data values to be deinterleaved.
27			*
28			* \b Outputs
29			* \li iBuffer: The floating point I buffer output data.
30			* \li qBuffer: The floating point Q buffer output data.
31			*
32			* \b Example
33			* \code
34			* int N = 10000;
35			*
36			* volk_16ic_s32f_deinterleave_32f_x2();
37			*
38			* volk_free(x);
39			* volk_free(t);
40			* \endcode
41			*/
42
43			#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
44			#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
45
46			#include <inttypes.h>
47			#include <stdio.h>
48			#include <volk/volk_common.h>
49
50			#ifdef LV_HAVE_AVX2
51			#include <immintrin.h>
52
53			static inline void
54		2	volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
55			float* qBuffer,
56			const lv_16sc_t* complexVector,
57			const float scalar,
58			unsigned int num_points)
59			{
60		2	float* iBufferPtr = iBuffer;
61		2	float* qBufferPtr = qBuffer;
62
63		2	uint64_t number = 0;
64		2	const uint64_t eighthPoints = num_points / 8;
65			__m256 cplxValue1, cplxValue2, iValue, qValue;
66			__m256i cplxValueA, cplxValueB;
67			__m128i cplxValue128;
68
69		2	__m256 invScalar = _mm256_set1_ps(1.0 / scalar);
70		2	int16_t* complexVectorPtr = (int16_t*)complexVector;
71		2	__m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
72
73	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
74
75		32766	cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
76		32766	complexVectorPtr += 16;
77
78			// cvt
79		32766	cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
80		32766	cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
81		32766	cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
82		32766	cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
83		32766	cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
84		32766	cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
85
86		32766	cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
87		32766	cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
88
89			// Arrange in i1i2i3i4 format
90		32766	iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
91		32766	iValue = _mm256_permutevar8x32_ps(iValue, idx);
92			// Arrange in q1q2q3q4 format
93		32766	qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
94		32766	qValue = _mm256_permutevar8x32_ps(qValue, idx);
95
96			_mm256_store_ps(iBufferPtr, iValue);
97			_mm256_store_ps(qBufferPtr, qValue);
98
99		32766	iBufferPtr += 8;
100		32766	qBufferPtr += 8;
101			}
102
103		2	number = eighthPoints * 8;
104		2	complexVectorPtr = (int16_t*)&complexVector[number];
105	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
106		14	*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
107		14	*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
108			}
109		2	}
110			#endif /* LV_HAVE_AVX2 */
111
112			#ifdef LV_HAVE_SSE
113			#include <xmmintrin.h>
114
115			static inline void
116		2	volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
117			float* qBuffer,
118			const lv_16sc_t* complexVector,
119			const float scalar,
120			unsigned int num_points)
121			{
122		2	float* iBufferPtr = iBuffer;
123		2	float* qBufferPtr = qBuffer;
124
125		2	uint64_t number = 0;
126		2	const uint64_t quarterPoints = num_points / 4;
127			__m128 cplxValue1, cplxValue2, iValue, qValue;
128
129		2	__m128 invScalar = _mm_set_ps1(1.0 / scalar);
130		2	int16_t* complexVectorPtr = (int16_t*)complexVector;
131
132			__VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
133
134	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
135
136		65534	floatBuffer[0] = (float)(complexVectorPtr[0]);
137		65534	floatBuffer[1] = (float)(complexVectorPtr[1]);
138		65534	floatBuffer[2] = (float)(complexVectorPtr[2]);
139		65534	floatBuffer[3] = (float)(complexVectorPtr[3]);
140
141		65534	floatBuffer[4] = (float)(complexVectorPtr[4]);
142		65534	floatBuffer[5] = (float)(complexVectorPtr[5]);
143		65534	floatBuffer[6] = (float)(complexVectorPtr[6]);
144		65534	floatBuffer[7] = (float)(complexVectorPtr[7]);
145
146		65534	cplxValue1 = _mm_load_ps(&floatBuffer[0]);
147		65534	cplxValue2 = _mm_load_ps(&floatBuffer[4]);
148
149		65534	complexVectorPtr += 8;
150
151		65534	cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
152		65534	cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
153
154			// Arrange in i1i2i3i4 format
155		65534	iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
156			// Arrange in q1q2q3q4 format
157		65534	qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
158
159			_mm_store_ps(iBufferPtr, iValue);
160			_mm_store_ps(qBufferPtr, qValue);
161
162		65534	iBufferPtr += 4;
163		65534	qBufferPtr += 4;
164			}
165
166		2	number = quarterPoints * 4;
167		2	complexVectorPtr = (int16_t*)&complexVector[number];
168	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
169		6	*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
170		6	*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
171			}
172		2	}
173			#endif /* LV_HAVE_SSE */
174
175			#ifdef LV_HAVE_GENERIC
176
177			static inline void
178		2	volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
179			float* qBuffer,
180			const lv_16sc_t* complexVector,
181			const float scalar,
182			unsigned int num_points)
183			{
184		2	const int16_t* complexVectorPtr = (const int16_t*)complexVector;
185		2	float* iBufferPtr = iBuffer;
186		2	float* qBufferPtr = qBuffer;
187			unsigned int number;
188	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
189		262142	*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
190		262142	*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
191			}
192		2	}
193			#endif /* LV_HAVE_GENERIC */
194
195			#ifdef LV_HAVE_NEON
196			#include <arm_neon.h>
197			static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
198			float* qBuffer,
199			const lv_16sc_t* complexVector,
200			const float scalar,
201			unsigned int num_points)
202			{
203			const int16_t* complexVectorPtr = (const int16_t*)complexVector;
204			float* iBufferPtr = iBuffer;
205			float* qBufferPtr = qBuffer;
206			unsigned int eighth_points = num_points / 4;
207			unsigned int number;
208			float iScalar = 1.f / scalar;
209			float32x4_t invScalar;
210			invScalar = vld1q_dup_f32(&iScalar);
211
212			int16x4x2_t complexInput_s16;
213			int32x4x2_t complexInput_s32;
214			float32x4x2_t complexFloat;
215
216			for (number = 0; number < eighth_points; number++) {
217			complexInput_s16 = vld2_s16(complexVectorPtr);
218			complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
219			complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
220			complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
221			complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
222			complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
223			complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
224			vst1q_f32(iBufferPtr, complexFloat.val[0]);
225			vst1q_f32(qBufferPtr, complexFloat.val[1]);
226			complexVectorPtr += 8;
227			iBufferPtr += 4;
228			qBufferPtr += 4;
229			}
230
231			for (number = eighth_points * 4; number < num_points; number++) {
232			*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
233			*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
234			}
235			}
236			#endif /* LV_HAVE_NEON */
237
238			#ifdef LV_HAVE_ORC
239			extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
240			float* qBuffer,
241			const lv_16sc_t* complexVector,
242			const float scalar,
243			unsigned int num_points);
244
245			static inline void
246		2	volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
247			float* qBuffer,
248			const lv_16sc_t* complexVector,
249			const float scalar,
250			unsigned int num_points)
251			{
252		2	volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
253			iBuffer, qBuffer, complexVector, scalar, num_points);
254		2	}
255			#endif /* LV_HAVE_ORC */
256
257
258			#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
259
260
261			#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
262			#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
263
264			#include <inttypes.h>
265			#include <stdio.h>
266			#include <volk/volk_common.h>
267
268			#ifdef LV_HAVE_AVX2
269			#include <immintrin.h>
270
271			static inline void
272		2	volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
273			float* qBuffer,
274			const lv_16sc_t* complexVector,
275			const float scalar,
276			unsigned int num_points)
277			{
278		2	float* iBufferPtr = iBuffer;
279		2	float* qBufferPtr = qBuffer;
280
281		2	uint64_t number = 0;
282		2	const uint64_t eighthPoints = num_points / 8;
283			__m256 cplxValue1, cplxValue2, iValue, qValue;
284			__m256i cplxValueA, cplxValueB;
285			__m128i cplxValue128;
286
287		2	__m256 invScalar = _mm256_set1_ps(1.0 / scalar);
288		2	int16_t* complexVectorPtr = (int16_t*)complexVector;
289		2	__m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
290
291	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
292
293		32766	cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
294		32766	complexVectorPtr += 16;
295
296			// cvt
297		32766	cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
298		32766	cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
299		32766	cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
300		32766	cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
301		32766	cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
302		32766	cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
303
304		32766	cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
305		32766	cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
306
307			// Arrange in i1i2i3i4 format
308		32766	iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
309		32766	iValue = _mm256_permutevar8x32_ps(iValue, idx);
310			// Arrange in q1q2q3q4 format
311		32766	qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
312		32766	qValue = _mm256_permutevar8x32_ps(qValue, idx);
313
314			_mm256_storeu_ps(iBufferPtr, iValue);
315			_mm256_storeu_ps(qBufferPtr, qValue);
316
317		32766	iBufferPtr += 8;
318		32766	qBufferPtr += 8;
319			}
320
321		2	number = eighthPoints * 8;
322		2	complexVectorPtr = (int16_t*)&complexVector[number];
323	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
324		14	*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
325		14	*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
326			}
327		2	}
328			#endif /* LV_HAVE_AVX2 */
329
330			#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
331