GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_8ic_deinterleave_16i_x2.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	123	123	100.0%
Functions:	5	5	100.0%
Branches:	18	18	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_8ic_deinterleave_16i_x2
    
       *
    
       * \b Overview
    
       *
    
       * Deinterleaves the complex 8-bit char vector into I & Q vector data
    
       * and converts them to 16-bit shorts.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
    
       * complexVector, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li complexVector: The complex input vector.
    
       * \li num_points: The number of complex data values to be deinterleaved.
    
       *
    
       * \b Outputs
    
       * \li iBuffer: The I buffer output data.
    
       * \li qBuffer: The Q buffer output data.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_8ic_deinterleave_16i_x2();
    
       *
    
       * volk_free(x);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
    
      #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
    
                                                             int16_t* qBuffer,
    
                                                             const lv_8sc_t* complexVector,
    
                                                             unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          int16_t* iBufferPtr = iBuffer;
    
      2
          int16_t* qBufferPtr = qBuffer;
    
      2
          __m256i MoveMask = _mm256_set_epi8(15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0,
    
                                             15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0);
    
          __m256i complexVal, iOutputVal, qOutputVal;
    
          __m128i iOutputVal0, qOutputVal0;
    
      2
          unsigned int sixteenthPoints = num_points / 16;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (number = 0; number < sixteenthPoints; number++) {
    
      16382
              complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
    
      16382
              complexVectorPtr += 32;
    
      16382
              complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
    
      16382
              complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
    
      16382
              iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
    
      16382
              qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
    
      16382
              iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
    
      16382
              iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
    
      16382
              qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
    
      16382
              qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
    
              _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
    
              _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
    
      16382
              iBufferPtr += 16;
    
      16382
              qBufferPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              *iBufferPtr++ =
    
      30
                  ((int16_t)*complexVectorPtr++) *
    
                  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
    
      30
              *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      2
      static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
    
                                                               int16_t* qBuffer,
    
                                                               const lv_8sc_t* complexVector,
    
                                                               unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          int16_t* iBufferPtr = iBuffer;
    
      2
          int16_t* qBufferPtr = qBuffer;
    
      2
          __m128i iMoveMask = _mm_set_epi8(0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           14,
    
                                           12,
    
                                           10,
    
                                           8,
    
                                           6,
    
                                           4,
    
                                           2,
    
                                           0); // set 16 byte values
    
      2
          __m128i qMoveMask = _mm_set_epi8(
    
              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
    
          __m128i complexVal, iOutputVal, qOutputVal;
    
      2
          unsigned int eighthPoints = num_points / 8;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (number = 0; number < eighthPoints; number++) {
    
      32766
              complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
    
      32766
              complexVectorPtr += 16; // aligned load
    
      32766
              iOutputVal = _mm_shuffle_epi8(complexVal,
    
                                            iMoveMask); // shuffle 16 bytes of 128bit complexVal
    
      32766
              qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
    
      32766
              iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
    
                                                          // of lower 8 bytes of input to output
    
              iOutputVal =
    
      32766
                  _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
    
                                                 // 16-bit integers, shift in with zeros
    
      32766
              qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
    
      32766
              qOutputVal = _mm_slli_epi16(qOutputVal, 8);
    
              _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
    
              _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
    
      32766
              iBufferPtr += 8;
    
      32766
              qBufferPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *iBufferPtr++ =
    
      14
                  ((int16_t)*complexVectorPtr++) *
    
                  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
    
      14
              *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
    
                                                            int16_t* qBuffer,
    
                                                            const lv_8sc_t* complexVector,
    
                                                            unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          int16_t* iBufferPtr = iBuffer;
    
      2
          int16_t* qBufferPtr = qBuffer;
    
      2
          __m128i iMoveMask = _mm_set_epi8(0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           0x80,
    
                                           14,
    
                                           12,
    
                                           10,
    
                                           8,
    
                                           6,
    
                                           4,
    
                                           2,
    
                                           0); // set 16 byte values
    
      2
          __m128i qMoveMask = _mm_set_epi8(
    
              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
    
          __m256i complexVal, iOutputVal, qOutputVal;
    
          __m128i complexVal1, complexVal0;
    
          __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
    
      2
          unsigned int sixteenthPoints = num_points / 16;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (number = 0; number < sixteenthPoints; number++) {
    
      16382
              complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
    
      16382
              complexVectorPtr += 32; // aligned load
    
              // Extract from complexVal to iOutputVal and qOutputVal
    
      16382
              complexVal1 = _mm256_extractf128_si256(complexVal, 1);
    
      16382
              complexVal0 = _mm256_extractf128_si256(complexVal, 0);
    
      16382
              iOutputVal1 = _mm_shuffle_epi8(
    
                  complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
    
      16382
              iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
    
      16382
              qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
    
      16382
              qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
    
              iOutputVal1 =
    
      16382
                  _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
    
                                                  // lower 8 bytes of input to output
    
              iOutputVal1 =
    
      16382
                  _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
    
                                                  // 16-bit integers, shift in with zeros
    
      16382
              iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
    
      16382
              iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
    
      16382
              qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
    
      16382
              qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
    
      16382
              qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
    
      16382
              qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
    
              // Pack iOutputVal0,1 to iOutputVal
    
      16382
              __m256i dummy = _mm256_setzero_si256();
    
      16382
              iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
    
      16382
              iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
    
      16382
              qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
    
      16382
              qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
    
              _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
    
              _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
    
      16382
              iBufferPtr += 16;
    
      16382
              qBufferPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              *iBufferPtr++ =
    
      30
                  ((int16_t)*complexVectorPtr++) *
    
                  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
    
      30
              *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
    
                                                              int16_t* qBuffer,
    
                                                              const lv_8sc_t* complexVector,
    
                                                              unsigned int num_points)
    
      {
    
      2
          const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    
      2
          int16_t* iBufferPtr = iBuffer;
    
      2
          int16_t* qBufferPtr = qBuffer;
    
          unsigned int number;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
    
      262142
              *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
    
      #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
    
      #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      2
      static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
    
                                                             int16_t* qBuffer,
    
                                                             const lv_8sc_t* complexVector,
    
                                                             unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const int8_t* complexVectorPtr = (int8_t*)complexVector;
    
      2
          int16_t* iBufferPtr = iBuffer;
    
      2
          int16_t* qBufferPtr = qBuffer;
    
      2
          __m256i MoveMask = _mm256_set_epi8(15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0,
    
                                             15,
    
                                             13,
    
                                             11,
    
                                             9,
    
                                             7,
    
                                             5,
    
                                             3,
    
                                             1,
    
                                             14,
    
                                             12,
    
                                             10,
    
                                             8,
    
                                             6,
    
                                             4,
    
                                             2,
    
                                             0);
    
          __m256i complexVal, iOutputVal, qOutputVal;
    
          __m128i iOutputVal0, qOutputVal0;
    
      2
          unsigned int sixteenthPoints = num_points / 16;
    
        2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.

      16384
          for (number = 0; number < sixteenthPoints; number++) {
    
      16382
              complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
    
      16382
              complexVectorPtr += 32;
    
      16382
              complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
    
      16382
              complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
    
      16382
              iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
    
      16382
              qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
    
      16382
              iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
    
      16382
              iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
    
      16382
              qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
    
      16382
              qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
    
              _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
    
              _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
    
      16382
              iBufferPtr += 16;
    
      16382
              qBufferPtr += 16;
    
          }
    
      2
          number = sixteenthPoints * 16;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.

      32
          for (; number < num_points; number++) {
    
      30
              *iBufferPtr++ =
    
      30
                  ((int16_t)*complexVectorPtr++) *
    
                  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
    
      30
              *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 */
    
      #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_8ic_deinterleave_16i_x2
12			*
13			* \b Overview
14			*
15			* Deinterleaves the complex 8-bit char vector into I & Q vector data
16			* and converts them to 16-bit shorts.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
21			* complexVector, unsigned int num_points) \endcode
22			*
23			* \b Inputs
24			* \li complexVector: The complex input vector.
25			* \li num_points: The number of complex data values to be deinterleaved.
26			*
27			* \b Outputs
28			* \li iBuffer: The I buffer output data.
29			* \li qBuffer: The Q buffer output data.
30			*
31			* \b Example
32			* \code
33			* int N = 10000;
34			*
35			* volk_8ic_deinterleave_16i_x2();
36			*
37			* volk_free(x);
38			* \endcode
39			*/
40
41			#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
42			#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
43
44			#include <inttypes.h>
45			#include <stdio.h>
46
47			#ifdef LV_HAVE_AVX2
48			#include <immintrin.h>
49
50		2	static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
51			int16_t* qBuffer,
52			const lv_8sc_t* complexVector,
53			unsigned int num_points)
54			{
55		2	unsigned int number = 0;
56		2	const int8_t* complexVectorPtr = (int8_t*)complexVector;
57		2	int16_t* iBufferPtr = iBuffer;
58		2	int16_t* qBufferPtr = qBuffer;
59		2	__m256i MoveMask = _mm256_set_epi8(15,
60			13,
61			11,
62			9,
63			7,
64			5,
65			3,
66			1,
67			14,
68			12,
69			10,
70			8,
71			6,
72			4,
73			2,
74			0,
75			15,
76			13,
77			11,
78			9,
79			7,
80			5,
81			3,
82			1,
83			14,
84			12,
85			10,
86			8,
87			6,
88			4,
89			2,
90			0);
91			__m256i complexVal, iOutputVal, qOutputVal;
92			__m128i iOutputVal0, qOutputVal0;
93
94		2	unsigned int sixteenthPoints = num_points / 16;
95
96	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (number = 0; number < sixteenthPoints; number++) {
97		16382	complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
98		16382	complexVectorPtr += 32;
99
100		16382	complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
101		16382	complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
102
103		16382	iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
104		16382	qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
105
106		16382	iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
107		16382	iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
108
109		16382	qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
110		16382	qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
111
112			_mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
113			_mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
114
115		16382	iBufferPtr += 16;
116		16382	qBufferPtr += 16;
117			}
118
119		2	number = sixteenthPoints * 16;
120	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
121		30	*iBufferPtr++ =
122		30	((int16_t)*complexVectorPtr++) *
123			256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
124		30	*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
125			}
126		2	}
127			#endif /* LV_HAVE_AVX2 */
128
129			#ifdef LV_HAVE_SSE4_1
130			#include <smmintrin.h>
131
132		2	static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
133			int16_t* qBuffer,
134			const lv_8sc_t* complexVector,
135			unsigned int num_points)
136			{
137		2	unsigned int number = 0;
138		2	const int8_t* complexVectorPtr = (int8_t*)complexVector;
139		2	int16_t* iBufferPtr = iBuffer;
140		2	int16_t* qBufferPtr = qBuffer;
141		2	__m128i iMoveMask = _mm_set_epi8(0x80,
142			0x80,
143			0x80,
144			0x80,
145			0x80,
146			0x80,
147			0x80,
148			0x80,
149			14,
150			12,
151			10,
152			8,
153			6,
154			4,
155			2,
156			0); // set 16 byte values
157		2	__m128i qMoveMask = _mm_set_epi8(
158			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
159			__m128i complexVal, iOutputVal, qOutputVal;
160
161		2	unsigned int eighthPoints = num_points / 8;
162
163	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (number = 0; number < eighthPoints; number++) {
164		32766	complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
165		32766	complexVectorPtr += 16; // aligned load
166
167		32766	iOutputVal = _mm_shuffle_epi8(complexVal,
168			iMoveMask); // shuffle 16 bytes of 128bit complexVal
169		32766	qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
170
171		32766	iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
172			// of lower 8 bytes of input to output
173			iOutputVal =
174		32766	_mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
175			// 16-bit integers, shift in with zeros
176
177		32766	qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
178		32766	qOutputVal = _mm_slli_epi16(qOutputVal, 8);
179
180			_mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
181			_mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
182
183		32766	iBufferPtr += 8;
184		32766	qBufferPtr += 8;
185			}
186
187		2	number = eighthPoints * 8;
188	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
189		14	*iBufferPtr++ =
190		14	((int16_t)*complexVectorPtr++) *
191			256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
192		14	*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
193			}
194		2	}
195			#endif /* LV_HAVE_SSE4_1 */
196
197
198			#ifdef LV_HAVE_AVX
199			#include <immintrin.h>
200
201		2	static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
202			int16_t* qBuffer,
203			const lv_8sc_t* complexVector,
204			unsigned int num_points)
205			{
206		2	unsigned int number = 0;
207		2	const int8_t* complexVectorPtr = (int8_t*)complexVector;
208		2	int16_t* iBufferPtr = iBuffer;
209		2	int16_t* qBufferPtr = qBuffer;
210		2	__m128i iMoveMask = _mm_set_epi8(0x80,
211			0x80,
212			0x80,
213			0x80,
214			0x80,
215			0x80,
216			0x80,
217			0x80,
218			14,
219			12,
220			10,
221			8,
222			6,
223			4,
224			2,
225			0); // set 16 byte values
226		2	__m128i qMoveMask = _mm_set_epi8(
227			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
228			__m256i complexVal, iOutputVal, qOutputVal;
229			__m128i complexVal1, complexVal0;
230			__m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
231
232		2	unsigned int sixteenthPoints = num_points / 16;
233
234	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (number = 0; number < sixteenthPoints; number++) {
235		16382	complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
236		16382	complexVectorPtr += 32; // aligned load
237
238			// Extract from complexVal to iOutputVal and qOutputVal
239		16382	complexVal1 = _mm256_extractf128_si256(complexVal, 1);
240		16382	complexVal0 = _mm256_extractf128_si256(complexVal, 0);
241
242		16382	iOutputVal1 = _mm_shuffle_epi8(
243			complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
244		16382	iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
245		16382	qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
246		16382	qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
247
248			iOutputVal1 =
249		16382	_mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
250			// lower 8 bytes of input to output
251			iOutputVal1 =
252		16382	_mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
253			// 16-bit integers, shift in with zeros
254		16382	iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
255		16382	iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
256
257		16382	qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
258		16382	qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
259		16382	qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
260		16382	qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
261
262			// Pack iOutputVal0,1 to iOutputVal
263		16382	__m256i dummy = _mm256_setzero_si256();
264		16382	iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
265		16382	iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
266		16382	qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
267		16382	qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
268
269			_mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
270			_mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
271
272		16382	iBufferPtr += 16;
273		16382	qBufferPtr += 16;
274			}
275
276		2	number = sixteenthPoints * 16;
277	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
278		30	*iBufferPtr++ =
279		30	((int16_t)*complexVectorPtr++) *
280			256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
281		30	*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
282			}
283		2	}
284			#endif /* LV_HAVE_AVX */
285
286
287			#ifdef LV_HAVE_GENERIC
288
289		2	static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
290			int16_t* qBuffer,
291			const lv_8sc_t* complexVector,
292			unsigned int num_points)
293			{
294		2	const int8_t* complexVectorPtr = (const int8_t*)complexVector;
295		2	int16_t* iBufferPtr = iBuffer;
296		2	int16_t* qBufferPtr = qBuffer;
297			unsigned int number;
298	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
299		262142	*iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
300		262142	*qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
301			}
302		2	}
303			#endif /* LV_HAVE_GENERIC */
304
305
306			#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
307
308			#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
309			#define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
310
311			#include <inttypes.h>
312			#include <stdio.h>
313
314			#ifdef LV_HAVE_AVX2
315			#include <immintrin.h>
316
317		2	static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
318			int16_t* qBuffer,
319			const lv_8sc_t* complexVector,
320			unsigned int num_points)
321			{
322		2	unsigned int number = 0;
323		2	const int8_t* complexVectorPtr = (int8_t*)complexVector;
324		2	int16_t* iBufferPtr = iBuffer;
325		2	int16_t* qBufferPtr = qBuffer;
326		2	__m256i MoveMask = _mm256_set_epi8(15,
327			13,
328			11,
329			9,
330			7,
331			5,
332			3,
333			1,
334			14,
335			12,
336			10,
337			8,
338			6,
339			4,
340			2,
341			0,
342			15,
343			13,
344			11,
345			9,
346			7,
347			5,
348			3,
349			1,
350			14,
351			12,
352			10,
353			8,
354			6,
355			4,
356			2,
357			0);
358			__m256i complexVal, iOutputVal, qOutputVal;
359			__m128i iOutputVal0, qOutputVal0;
360
361		2	unsigned int sixteenthPoints = num_points / 16;
362
363	2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times.	16384	for (number = 0; number < sixteenthPoints; number++) {
364		16382	complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
365		16382	complexVectorPtr += 32;
366
367		16382	complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
368		16382	complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
369
370		16382	iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
371		16382	qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
372
373		16382	iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
374		16382	iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
375
376		16382	qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
377		16382	qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
378
379			_mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
380			_mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
381
382		16382	iBufferPtr += 16;
383		16382	qBufferPtr += 16;
384			}
385
386		2	number = sixteenthPoints * 16;
387	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times.	32	for (; number < num_points; number++) {
388		30	*iBufferPtr++ =
389		30	((int16_t)*complexVectorPtr++) *
390			256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
391		30	*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
392			}
393		2	}
394			#endif /* LV_HAVE_AVX2 */
395			#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
396