GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	156	156	100.0%
Functions:	5	5	100.0%
Branches:	18	18	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_8ic_s32f_deinterleave_32f_x2
    
       *
    
       * \b Overview
    
       *
    
       * Deinterleaves the complex 8-bit char vector into I & Q vector data,
    
       * converts them to floats, and divides the results by the scalar
    
       * factor.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
    
       * complexVector, const float scalar, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li complexVector: The complex input vector.
    
       * \li scalar: The scalar value used to divide the floating point results.
    
       * \li num_points: The number of complex data values to be deinterleaved.
    
       *
    
       * \b Outputs
    
       * \li iBuffer: The I buffer output data.
    
       * \li qBuffer: The Q buffer output data.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_8ic_s32f_deinterleave_32f_x2();
    
       *
    
       * volk_free(x);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
    
      #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
    
                                                 float* qBuffer,
    
                                                 const lv_8sc_t* complexVector,
    
                                                 const float scalar,
    
                                                 unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
          __m128 iFloatValue, qFloatValue;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m128 invScalar = _mm_set_ps1(iScalar);
    
          __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
    
      2
          int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          __m128i iMoveMask = _mm_set_epi8(
    
              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    
      2
          __m128i qMoveMask = _mm_set_epi8(
    
              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
    
      32766
              complexVectorPtr += 16;
    
      32766
              iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
    
      32766
              qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
    
      32766
              iIntVal = _mm_cvtepi8_epi32(iComplexVal);
    
      32766
              iFloatValue = _mm_cvtepi32_ps(iIntVal);
    
      32766
              iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
    
              _mm_store_ps(iBufferPtr, iFloatValue);
    
      32766
              iBufferPtr += 4;
    
      32766
              iComplexVal = _mm_srli_si128(iComplexVal, 4);
    
      32766
              iIntVal = _mm_cvtepi8_epi32(iComplexVal);
    
      32766
              iFloatValue = _mm_cvtepi32_ps(iIntVal);
    
      32766
              iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
    
              _mm_store_ps(iBufferPtr, iFloatValue);
    
      32766
              iBufferPtr += 4;
    
      32766
              qIntVal = _mm_cvtepi8_epi32(qComplexVal);
    
      32766
              qFloatValue = _mm_cvtepi32_ps(qIntVal);
    
      32766
              qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
    
              _mm_store_ps(qBufferPtr, qFloatValue);
    
      32766
              qBufferPtr += 4;
    
      32766
              qComplexVal = _mm_srli_si128(qComplexVal, 4);
    
      32766
              qIntVal = _mm_cvtepi8_epi32(qComplexVal);
    
      32766
              qFloatValue = _mm_cvtepi32_ps(qIntVal);
    
      32766
              qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
    
              _mm_store_ps(qBufferPtr, qFloatValue);
    
      32766
              qBufferPtr += 4;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
      14
              *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
    
                                                                 float* qBuffer,
    
                                                                 const lv_8sc_t* complexVector,
    
                                                                 const float scalar,
    
                                                                 unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128 cplxValue1, cplxValue2, iValue, qValue;
    
      2
          __m128 invScalar = _mm_set_ps1(1.0 / scalar);
    
      2
          int8_t* complexVectorPtr = (int8_t*)complexVector;
    
          __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              floatBuffer[0] = (float)(complexVectorPtr[0]);
    
      65534
              floatBuffer[1] = (float)(complexVectorPtr[1]);
    
      65534
              floatBuffer[2] = (float)(complexVectorPtr[2]);
    
      65534
              floatBuffer[3] = (float)(complexVectorPtr[3]);
    
      65534
              floatBuffer[4] = (float)(complexVectorPtr[4]);
    
      65534
              floatBuffer[5] = (float)(complexVectorPtr[5]);
    
      65534
              floatBuffer[6] = (float)(complexVectorPtr[6]);
    
      65534
              floatBuffer[7] = (float)(complexVectorPtr[7]);
    
      65534
              cplxValue1 = _mm_load_ps(&floatBuffer[0]);
    
      65534
              cplxValue2 = _mm_load_ps(&floatBuffer[4]);
    
      65534
              complexVectorPtr += 8;
    
      65534
              cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
    
      65534
              cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
    
              // Arrange in i1i2i3i4 format
    
      65534
              iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    
      65534
              qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    
              _mm_store_ps(iBufferPtr, iValue);
    
              _mm_store_ps(qBufferPtr, qValue);
    
      65534
              iBufferPtr += 4;
    
      65534
              qBufferPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          complexVectorPtr = (int8_t*)&complexVector[number];
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
      6
              *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
    
                                                                  float* qBuffer,
    
                                                                  const lv_8sc_t* complexVector,
    
                                                                  const float scalar,
    
                                                                  unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
          __m256 iFloatValue, qFloatValue;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m256 invScalar = _mm256_set1_ps(iScalar);
    
          __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
    
      2
          int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          __m256i iMoveMask = _mm256_set_epi8(0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              14,
    
                                              12,
    
                                              10,
    
                                              8,
    
                                              6,
    
                                              4,
    
                                              2,
    
                                              0,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              14,
    
                                              12,
    
                                              10,
    
                                              8,
    
                                              6,
    
                                              4,
    
                                              2,
    
                                              0);
    
      2
          __m256i qMoveMask = _mm256_set_epi8(0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              15,
    
                                              13,
    
                                              11,
    
                                              9,
    
                                              7,
    
                                              5,
    
                                              3,
    
                                              1,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              0x80,
    
                                              15,
    
                                              13,
    
                                              11,
    
                                              9,
    
                                              7,
    
                                              5,
    
                                              3,
    
                                              1);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
    
      16382
              complexVectorPtr += 32;
    
      16382
              iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
    
      16382
              qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
    
      32764
              iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
    
      16382
              iFloatValue = _mm256_cvtepi32_ps(iIntVal);
    
      16382
              iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
    
              _mm256_store_ps(iBufferPtr, iFloatValue);
    
      16382
              iBufferPtr += 8;
    
      16382
              iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
    
      32764
              iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
    
      16382
              iFloatValue = _mm256_cvtepi32_ps(iIntVal);
    
      16382
              iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
    
              _mm256_store_ps(iBufferPtr, iFloatValue);
    
      16382
              iBufferPtr += 8;
    
      32764
              qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
    
      16382
              qFloatValue = _mm256_cvtepi32_ps(qIntVal);
    
      16382
              qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
    
              _mm256_store_ps(qBufferPtr, qFloatValue);
    
      16382
              qBufferPtr += 8;
    
      16382
              qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
    
      32764
              qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
    
      16382
              qFloatValue = _mm256_cvtepi32_ps(qIntVal);
    
      16382
              qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
    
              _mm256_store_ps(qBufferPtr, qFloatValue);
    
      16382
              qBufferPtr += 8;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
      30
              *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
    
                                                float* qBuffer,
    
                                                const lv_8sc_t* complexVector,
    
                                                const float scalar,
    
                                                unsigned int num_points)
    
      {
    
      2
          const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
          unsigned int number;
    
      2
          const float invScalar = 1.0 / scalar;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
    
      262142
              *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
    
      #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
    
      #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_common.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
    
                                                                  float* qBuffer,
    
                                                                  const lv_8sc_t* complexVector,
    
                                                                  const float scalar,
    
                                                                  unsigned int num_points)
    
      {
    
      2
          float* iBufferPtr = iBuffer;
    
      2
          float* qBufferPtr = qBuffer;
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int sixteenthPoints = num_points / 16;
    
          __m256 iFloatValue, qFloatValue;
    
      2
          const float iScalar = 1.0 / scalar;
    
      2
          __m256 invScalar = _mm256_set1_ps(iScalar);
    
          __m256i complexVal, iIntVal, qIntVal;
    
          __m128i iComplexVal, qComplexVal;
    
      2
          int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          __m256i MoveMask = _mm256_set_epi8(15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0,
    
                                             15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0);
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (; number < sixteenthPoints; number++) {
    
      16382
              complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
    
      16382
              complexVectorPtr += 32;
    
      16382
              complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
    
      16382
              complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
    
      16382
              iComplexVal = _mm256_extractf128_si256(complexVal, 0);
    
      16382
              qComplexVal = _mm256_extractf128_si256(complexVal, 1);
    
      16382
              iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
    
      16382
              iFloatValue = _mm256_cvtepi32_ps(iIntVal);
    
      16382
              iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
    
              _mm256_storeu_ps(iBufferPtr, iFloatValue);
    
      16382
              iBufferPtr += 8;
    
      16382
              qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
    
      16382
              qFloatValue = _mm256_cvtepi32_ps(qIntVal);
    
      16382
              qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
    
              _mm256_storeu_ps(qBufferPtr, qFloatValue);
    
      16382
              qBufferPtr += 8;
    
      16382
              complexVal = _mm256_srli_si256(complexVal, 8);
    
      16382
              iComplexVal = _mm256_extractf128_si256(complexVal, 0);
    
      16382
              qComplexVal = _mm256_extractf128_si256(complexVal, 1);
    
      16382
              iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
    
      16382
              iFloatValue = _mm256_cvtepi32_ps(iIntVal);
    
      16382
              iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
    
              _mm256_storeu_ps(iBufferPtr, iFloatValue);
    
      16382
              iBufferPtr += 8;
    
      16382
              qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
    
      16382
              qFloatValue = _mm256_cvtepi32_ps(qIntVal);
    
      16382
              qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
    
              _mm256_storeu_ps(qBufferPtr, qFloatValue);
    
      16382
              qBufferPtr += 8;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
      30
              *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_8ic_s32f_deinterleave_32f_x2
12			*
13			* \b Overview
14			*
15			* Deinterleaves the complex 8-bit char vector into I & Q vector data,
16			* converts them to floats, and divides the results by the scalar
17			* factor.
18			*
19			* <b>Dispatcher Prototype</b>
20			* \code
21			* void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
22			* complexVector, const float scalar, unsigned int num_points) \endcode
23			*
24			* \b Inputs
25			* \li complexVector: The complex input vector.
26			* \li scalar: The scalar value used to divide the floating point results.
27			* \li num_points: The number of complex data values to be deinterleaved.
28			*
29			* \b Outputs
30			* \li iBuffer: The I buffer output data.
31			* \li qBuffer: The Q buffer output data.
32			*
33			* \b Example
34			* \code
35			* int N = 10000;
36			*
37			* volk_8ic_s32f_deinterleave_32f_x2();
38			*
39			* volk_free(x);
40			* \endcode
41			*/
42
43			#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
44			#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
45
46			#include <inttypes.h>
47			#include <stdio.h>
48			#include <volk/volk_common.h>
49
50
51			#ifdef LV_HAVE_SSE4_1
52			#include <smmintrin.h>
53
54			static inline void
55		2	volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
56			float* qBuffer,
57			const lv_8sc_t* complexVector,
58			const float scalar,
59			unsigned int num_points)
60			{
61		2	float* iBufferPtr = iBuffer;
62		2	float* qBufferPtr = qBuffer;
63
64		2	unsigned int number = 0;
65		2	const unsigned int eighthPoints = num_points / 8;
66			__m128 iFloatValue, qFloatValue;
67
68		2	const float iScalar = 1.0 / scalar;
69		2	__m128 invScalar = _mm_set_ps1(iScalar);
70			__m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
71		2	int8_t* complexVectorPtr = (int8_t*)complexVector;
72
73		2	__m128i iMoveMask = _mm_set_epi8(
74			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
75		2	__m128i qMoveMask = _mm_set_epi8(
76			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
77
78	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
79		32766	complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
80		32766	complexVectorPtr += 16;
81		32766	iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
82		32766	qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
83
84		32766	iIntVal = _mm_cvtepi8_epi32(iComplexVal);
85		32766	iFloatValue = _mm_cvtepi32_ps(iIntVal);
86		32766	iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
87			_mm_store_ps(iBufferPtr, iFloatValue);
88		32766	iBufferPtr += 4;
89
90		32766	iComplexVal = _mm_srli_si128(iComplexVal, 4);
91
92		32766	iIntVal = _mm_cvtepi8_epi32(iComplexVal);
93		32766	iFloatValue = _mm_cvtepi32_ps(iIntVal);
94		32766	iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
95			_mm_store_ps(iBufferPtr, iFloatValue);
96		32766	iBufferPtr += 4;
97
98		32766	qIntVal = _mm_cvtepi8_epi32(qComplexVal);
99		32766	qFloatValue = _mm_cvtepi32_ps(qIntVal);
100		32766	qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
101			_mm_store_ps(qBufferPtr, qFloatValue);
102		32766	qBufferPtr += 4;
103
104		32766	qComplexVal = _mm_srli_si128(qComplexVal, 4);
105
106		32766	qIntVal = _mm_cvtepi8_epi32(qComplexVal);
107		32766	qFloatValue = _mm_cvtepi32_ps(qIntVal);
108		32766	qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
109			_mm_store_ps(qBufferPtr, qFloatValue);
110
111		32766	qBufferPtr += 4;
112			}
113
114		2	number = eighthPoints * 8;
115	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
116		14	*iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
117		14	*qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
118			}
119		2	}
120			#endif /* LV_HAVE_SSE4_1 */
121
122
123			#ifdef LV_HAVE_SSE
124			#include <xmmintrin.h>
125
126		2	static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
127			float* qBuffer,
128			const lv_8sc_t* complexVector,
129			const float scalar,
130			unsigned int num_points)
131			{
132		2	float* iBufferPtr = iBuffer;
133		2	float* qBufferPtr = qBuffer;
134
135		2	unsigned int number = 0;
136		2	const unsigned int quarterPoints = num_points / 4;
137			__m128 cplxValue1, cplxValue2, iValue, qValue;
138
139		2	__m128 invScalar = _mm_set_ps1(1.0 / scalar);
140		2	int8_t* complexVectorPtr = (int8_t*)complexVector;
141
142			__VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
143
144	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
145		65534	floatBuffer[0] = (float)(complexVectorPtr[0]);
146		65534	floatBuffer[1] = (float)(complexVectorPtr[1]);
147		65534	floatBuffer[2] = (float)(complexVectorPtr[2]);
148		65534	floatBuffer[3] = (float)(complexVectorPtr[3]);
149
150		65534	floatBuffer[4] = (float)(complexVectorPtr[4]);
151		65534	floatBuffer[5] = (float)(complexVectorPtr[5]);
152		65534	floatBuffer[6] = (float)(complexVectorPtr[6]);
153		65534	floatBuffer[7] = (float)(complexVectorPtr[7]);
154
155		65534	cplxValue1 = _mm_load_ps(&floatBuffer[0]);
156		65534	cplxValue2 = _mm_load_ps(&floatBuffer[4]);
157
158		65534	complexVectorPtr += 8;
159
160		65534	cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
161		65534	cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
162
163			// Arrange in i1i2i3i4 format
164		65534	iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
165		65534	qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
166
167			_mm_store_ps(iBufferPtr, iValue);
168			_mm_store_ps(qBufferPtr, qValue);
169
170		65534	iBufferPtr += 4;
171		65534	qBufferPtr += 4;
172			}
173
174		2	number = quarterPoints * 4;
175		2	complexVectorPtr = (int8_t*)&complexVector[number];
176	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
177		6	*iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
178		6	*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
179			}
180		2	}
181			#endif /* LV_HAVE_SSE */
182
183
184			#ifdef LV_HAVE_AVX2
185			#include <immintrin.h>
186
187		2	static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
188			float* qBuffer,
189			const lv_8sc_t* complexVector,
190			const float scalar,
191			unsigned int num_points)
192			{
193		2	float* iBufferPtr = iBuffer;
194		2	float* qBufferPtr = qBuffer;
195
196		2	unsigned int number = 0;
197		2	const unsigned int sixteenthPoints = num_points / 16;
198			__m256 iFloatValue, qFloatValue;
199
200		2	const float iScalar = 1.0 / scalar;
201		2	__m256 invScalar = _mm256_set1_ps(iScalar);
202			__m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
203		2	int8_t* complexVectorPtr = (int8_t*)complexVector;
204
205		2	__m256i iMoveMask = _mm256_set_epi8(0x80,
206			0x80,
207			0x80,
208			0x80,
209			0x80,
210			0x80,
211			0x80,
212			0x80,
213			14,
214			12,
215			10,
216			8,
217			6,
218			4,
219			2,
220			0,
221			0x80,
222			0x80,
223			0x80,
224			0x80,
225			0x80,
226			0x80,
227			0x80,
228			0x80,
229			14,
230			12,
231			10,
232			8,
233			6,
234			4,
235			2,
236			0);
237		2	__m256i qMoveMask = _mm256_set_epi8(0x80,
238			0x80,
239			0x80,
240			0x80,
241			0x80,
242			0x80,
243			0x80,
244			0x80,
245			15,
246			13,
247			11,
248			9,
249			7,
250			5,
251			3,
252			1,
253			0x80,
254			0x80,
255			0x80,
256			0x80,
257			0x80,
258			0x80,
259			0x80,
260			0x80,
261			15,
262			13,
263			11,
264			9,
265			7,
266			5,
267			3,
268			1);
269
270	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
271		16382	complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
272		16382	complexVectorPtr += 32;
273		16382	iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
274		16382	qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
275
276		32764	iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
277		16382	iFloatValue = _mm256_cvtepi32_ps(iIntVal);
278		16382	iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
279			_mm256_store_ps(iBufferPtr, iFloatValue);
280		16382	iBufferPtr += 8;
281
282		16382	iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
283		32764	iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
284		16382	iFloatValue = _mm256_cvtepi32_ps(iIntVal);
285		16382	iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
286			_mm256_store_ps(iBufferPtr, iFloatValue);
287		16382	iBufferPtr += 8;
288
289		32764	qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
290		16382	qFloatValue = _mm256_cvtepi32_ps(qIntVal);
291		16382	qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
292			_mm256_store_ps(qBufferPtr, qFloatValue);
293		16382	qBufferPtr += 8;
294
295		16382	qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
296		32764	qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
297		16382	qFloatValue = _mm256_cvtepi32_ps(qIntVal);
298		16382	qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
299			_mm256_store_ps(qBufferPtr, qFloatValue);
300		16382	qBufferPtr += 8;
301			}
302
303		2	number = sixteenthPoints * 16;
304	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
305		30	iBufferPtr++ = (float)(complexVectorPtr++) * iScalar;
306		30	qBufferPtr++ = (float)(complexVectorPtr++) * iScalar;
307			}
308		2	}
309			#endif /* LV_HAVE_AVX2 */
310
311
312			#ifdef LV_HAVE_GENERIC
313
314			static inline void
315		2	volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
316			float* qBuffer,
317			const lv_8sc_t* complexVector,
318			const float scalar,
319			unsigned int num_points)
320			{
321		2	const int8_t* complexVectorPtr = (const int8_t*)complexVector;
322		2	float* iBufferPtr = iBuffer;
323		2	float* qBufferPtr = qBuffer;
324			unsigned int number;
325		2	const float invScalar = 1.0 / scalar;
326	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
327		262142	iBufferPtr++ = (float)(complexVectorPtr++) * invScalar;
328		262142	qBufferPtr++ = (float)(complexVectorPtr++) * invScalar;
329			}
330		2	}
331			#endif /* LV_HAVE_GENERIC */
332
333
334			#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
335
336
337			#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
338			#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
339
340			#include <inttypes.h>
341			#include <stdio.h>
342			#include <volk/volk_common.h>
343
344			#ifdef LV_HAVE_AVX2
345			#include <immintrin.h>
346
347		2	static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
348			float* qBuffer,
349			const lv_8sc_t* complexVector,
350			const float scalar,
351			unsigned int num_points)
352			{
353		2	float* iBufferPtr = iBuffer;
354		2	float* qBufferPtr = qBuffer;
355
356		2	unsigned int number = 0;
357		2	const unsigned int sixteenthPoints = num_points / 16;
358			__m256 iFloatValue, qFloatValue;
359
360		2	const float iScalar = 1.0 / scalar;
361		2	__m256 invScalar = _mm256_set1_ps(iScalar);
362			__m256i complexVal, iIntVal, qIntVal;
363			__m128i iComplexVal, qComplexVal;
364		2	int8_t* complexVectorPtr = (int8_t*)complexVector;
365
366		2	__m256i MoveMask = _mm256_set_epi8(15,
367			13,
368			11,
369			9,
370			7,
371			5,
372			3,
373			1,
374			14,
375			12,
376			10,
377			8,
378			6,
379			4,
380			2,
381			0,
382			15,
383			13,
384			11,
385			9,
386			7,
387			5,
388			3,
389			1,
390			14,
391			12,
392			10,
393			8,
394			6,
395			4,
396			2,
397			0);
398
399	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (; number < sixteenthPoints; number++) {
400		16382	complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
401		16382	complexVectorPtr += 32;
402		16382	complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
403		16382	complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
404		16382	iComplexVal = _mm256_extractf128_si256(complexVal, 0);
405		16382	qComplexVal = _mm256_extractf128_si256(complexVal, 1);
406
407		16382	iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
408		16382	iFloatValue = _mm256_cvtepi32_ps(iIntVal);
409		16382	iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
410			_mm256_storeu_ps(iBufferPtr, iFloatValue);
411		16382	iBufferPtr += 8;
412
413		16382	qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
414		16382	qFloatValue = _mm256_cvtepi32_ps(qIntVal);
415		16382	qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
416			_mm256_storeu_ps(qBufferPtr, qFloatValue);
417		16382	qBufferPtr += 8;
418
419		16382	complexVal = _mm256_srli_si256(complexVal, 8);
420		16382	iComplexVal = _mm256_extractf128_si256(complexVal, 0);
421		16382	qComplexVal = _mm256_extractf128_si256(complexVal, 1);
422
423		16382	iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
424		16382	iFloatValue = _mm256_cvtepi32_ps(iIntVal);
425		16382	iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
426			_mm256_storeu_ps(iBufferPtr, iFloatValue);
427		16382	iBufferPtr += 8;
428
429		16382	qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
430		16382	qFloatValue = _mm256_cvtepi32_ps(qIntVal);
431		16382	qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
432			_mm256_storeu_ps(qBufferPtr, qFloatValue);
433		16382	qBufferPtr += 8;
434			}
435
436		2	number = sixteenthPoints * 16;
437	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
438		30	iBufferPtr++ = (float)(complexVectorPtr++) * iScalar;
439		30	qBufferPtr++ = (float)(complexVectorPtr++) * iScalar;
440			}
441		2	}
442			#endif /* LV_HAVE_AVX2 */
443
444			#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
445