GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	135	135	100.0%
Functions:	4	4	100.0%
Branches:	14	14	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_8ic_x2_s32f_multiply_conjugate_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Multiplys the one complex vector with the complex conjugate of the
    
       * second complex vector and stores their results in the third vector
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t*
    
       * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: One of the complex vectors to be multiplied.
    
       * \li bVector: The complex vector which will be converted to complex conjugate and
    
       * multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The
    
       * number of complex values in aVector and bVector to be multiplied together and stored
    
       * into cVector.
    
       *
    
       * \b Outputs
    
       * \li cVector: The complex vector where the results will be stored.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * <FIXME>
    
       *
    
       * volk_8ic_x2_s32f_multiply_conjugate_32fc();
    
       *
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
    
      #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
    
                                                      const lv_8sc_t* aVector,
    
                                                      const lv_8sc_t* bVector,
    
                                                      const float scalar,
    
                                                      unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int oneEigthPoints = num_points / 8;
    
          __m256i x, y, realz, imagz;
    
          __m256 ret, retlo, rethi;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_8sc_t* a = aVector;
    
      2
          const lv_8sc_t* b = bVector;
    
          __m256i conjugateSign =
    
      2
              _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
    
      2
          __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < oneEigthPoints; number++) {
    
              // Convert  8 bit values into 16 bit values
    
      65532
              x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
    
      65532
              y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
    
              // Calculate the ar*cr - ai*(-ci) portions
    
      32766
              realz = _mm256_madd_epi16(x, y);
    
              // Calculate the complex conjugate of the cr + ci j values
    
      32766
              y = _mm256_sign_epi16(y, conjugateSign);
    
              // Shift the order of the cr and ci values
    
      32766
              y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
    
                                         _MM_SHUFFLE(2, 3, 0, 1));
    
              // Calculate the ar*(-ci) + cr*(ai)
    
      32766
              imagz = _mm256_madd_epi16(x, y);
    
              // Interleave real and imaginary and then convert to float values
    
      65532
              retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      32766
              retlo = _mm256_mul_ps(retlo, invScalar);
    
              // Interleave real and imaginary and then convert to float values
    
      65532
              rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      32766
              rethi = _mm256_mul_ps(rethi, invScalar);
    
      32766
              ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
    
              _mm256_store_ps((float*)c, ret);
    
      32766
              c += 4;
    
      32766
              ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
    
              _mm256_store_ps((float*)c, ret);
    
      32766
              c += 4;
    
      32766
              a += 8;
    
      32766
              b += 8;
    
          }
    
      2
          number = oneEigthPoints * 8;
    
      2
          float* cFloatPtr = (float*)&cVector[number];
    
      2
          int8_t* a8Ptr = (int8_t*)&aVector[number];
    
      2
          int8_t* b8Ptr = (int8_t*)&bVector[number];
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              float aReal = (float)*a8Ptr++;
    
      14
              float aImag = (float)*a8Ptr++;
    
      14
              lv_32fc_t aVal = lv_cmake(aReal, aImag);
    
      14
              float bReal = (float)*b8Ptr++;
    
      14
              float bImag = (float)*b8Ptr++;
    
      14
              lv_32fc_t bVal = lv_cmake(bReal, -bImag);
    
      14
              lv_32fc_t temp = aVal * bVal;
    
      14
              *cFloatPtr++ = lv_creal(temp) / scalar;
    
      14
              *cFloatPtr++ = lv_cimag(temp) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2*/
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      static inline void
    
      2
      volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
    
                                                        const lv_8sc_t* aVector,
    
                                                        const lv_8sc_t* bVector,
    
                                                        const float scalar,
    
                                                        unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128i x, y, realz, imagz;
    
          __m128 ret;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_8sc_t* a = aVector;
    
      2
          const lv_8sc_t* b = bVector;
    
      2
          __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
    
      2
          __m128 invScalar = _mm_set_ps1(1.0 / scalar);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
              // Convert into 8 bit values into 16 bit values
    
      131068
              x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
    
      131068
              y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
    
              // Calculate the ar*cr - ai*(-ci) portions
    
      65534
              realz = _mm_madd_epi16(x, y);
    
              // Calculate the complex conjugate of the cr + ci j values
    
      65534
              y = _mm_sign_epi16(y, conjugateSign);
    
              // Shift the order of the cr and ci values
    
      65534
              y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
    
                                      _MM_SHUFFLE(2, 3, 0, 1));
    
              // Calculate the ar*(-ci) + cr*(ai)
    
      65534
              imagz = _mm_madd_epi16(x, y);
    
              // Interleave real and imaginary and then convert to float values
    
      131068
              ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      65534
              ret = _mm_mul_ps(ret, invScalar);
    
              // Store the floating point values
    
              _mm_store_ps((float*)c, ret);
    
      65534
              c += 2;
    
              // Interleave real and imaginary and then convert to float values
    
      131068
              ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      65534
              ret = _mm_mul_ps(ret, invScalar);
    
              // Store the floating point values
    
              _mm_store_ps((float*)c, ret);
    
      65534
              c += 2;
    
      65534
              a += 4;
    
      65534
              b += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          float* cFloatPtr = (float*)&cVector[number];
    
      2
          int8_t* a8Ptr = (int8_t*)&aVector[number];
    
      2
          int8_t* b8Ptr = (int8_t*)&bVector[number];
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              float aReal = (float)*a8Ptr++;
    
      6
              float aImag = (float)*a8Ptr++;
    
      6
              lv_32fc_t aVal = lv_cmake(aReal, aImag);
    
      6
              float bReal = (float)*b8Ptr++;
    
      6
              float bImag = (float)*b8Ptr++;
    
      6
              lv_32fc_t bVal = lv_cmake(bReal, -bImag);
    
      6
              lv_32fc_t temp = aVal * bVal;
    
      6
              *cFloatPtr++ = lv_creal(temp) / scalar;
    
      6
              *cFloatPtr++ = lv_cimag(temp) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
    
                                                       const lv_8sc_t* aVector,
    
                                                       const lv_8sc_t* bVector,
    
                                                       const float scalar,
    
                                                       unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          float* cPtr = (float*)cVector;
    
      2
          const float invScalar = 1.0 / scalar;
    
      2
          int8_t* a8Ptr = (int8_t*)aVector;
    
      2
          int8_t* b8Ptr = (int8_t*)bVector;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              float aReal = (float)*a8Ptr++;
    
      262142
              float aImag = (float)*a8Ptr++;
    
      262142
              lv_32fc_t aVal = lv_cmake(aReal, aImag);
    
      262142
              float bReal = (float)*b8Ptr++;
    
      262142
              float bImag = (float)*b8Ptr++;
    
      262142
              lv_32fc_t bVal = lv_cmake(bReal, -bImag);
    
      262142
              lv_32fc_t temp = aVal * bVal;
    
      262142
              *cPtr++ = (lv_creal(temp) * invScalar);
    
      262142
              *cPtr++ = (lv_cimag(temp) * invScalar);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
    
      #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
    
      #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX2
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
    
                                                      const lv_8sc_t* aVector,
    
                                                      const lv_8sc_t* bVector,
    
                                                      const float scalar,
    
                                                      unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int oneEigthPoints = num_points / 8;
    
          __m256i x, y, realz, imagz;
    
          __m256 ret, retlo, rethi;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_8sc_t* a = aVector;
    
      2
          const lv_8sc_t* b = bVector;
    
          __m256i conjugateSign =
    
      2
              _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
    
      2
          __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < oneEigthPoints; number++) {
    
              // Convert  8 bit values into 16 bit values
    
      65532
              x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
    
      65532
              y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
    
              // Calculate the ar*cr - ai*(-ci) portions
    
      32766
              realz = _mm256_madd_epi16(x, y);
    
              // Calculate the complex conjugate of the cr + ci j values
    
      32766
              y = _mm256_sign_epi16(y, conjugateSign);
    
              // Shift the order of the cr and ci values
    
      32766
              y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
    
                                         _MM_SHUFFLE(2, 3, 0, 1));
    
              // Calculate the ar*(-ci) + cr*(ai)
    
      32766
              imagz = _mm256_madd_epi16(x, y);
    
              // Interleave real and imaginary and then convert to float values
    
      65532
              retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      32766
              retlo = _mm256_mul_ps(retlo, invScalar);
    
              // Interleave real and imaginary and then convert to float values
    
      65532
              rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
    
              // Normalize the floating point values
    
      32766
              rethi = _mm256_mul_ps(rethi, invScalar);
    
      32766
              ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
    
              _mm256_storeu_ps((float*)c, ret);
    
      32766
              c += 4;
    
      32766
              ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
    
              _mm256_storeu_ps((float*)c, ret);
    
      32766
              c += 4;
    
      32766
              a += 8;
    
      32766
              b += 8;
    
          }
    
      2
          number = oneEigthPoints * 8;
    
      2
          float* cFloatPtr = (float*)&cVector[number];
    
      2
          int8_t* a8Ptr = (int8_t*)&aVector[number];
    
      2
          int8_t* b8Ptr = (int8_t*)&bVector[number];
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              float aReal = (float)*a8Ptr++;
    
      14
              float aImag = (float)*a8Ptr++;
    
      14
              lv_32fc_t aVal = lv_cmake(aReal, aImag);
    
      14
              float bReal = (float)*b8Ptr++;
    
      14
              float bImag = (float)*b8Ptr++;
    
      14
              lv_32fc_t bVal = lv_cmake(bReal, -bImag);
    
      14
              lv_32fc_t temp = aVal * bVal;
    
      14
              *cFloatPtr++ = lv_creal(temp) / scalar;
    
      14
              *cFloatPtr++ = lv_cimag(temp) / scalar;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2*/
    
      #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_8ic_x2_s32f_multiply_conjugate_32fc
12			*
13			* \b Overview
14			*
15			* Multiplys the one complex vector with the complex conjugate of the
16			* second complex vector and stores their results in the third vector
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t*
21			* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode
22			*
23			* \b Inputs
24			* \li aVector: One of the complex vectors to be multiplied.
25			* \li bVector: The complex vector which will be converted to complex conjugate and
26			* multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The
27			* number of complex values in aVector and bVector to be multiplied together and stored
28			* into cVector.
29			*
30			* \b Outputs
31			* \li cVector: The complex vector where the results will be stored.
32			*
33			* \b Example
34			* \code
35			* int N = 10000;
36			*
37			* <FIXME>
38			*
39			* volk_8ic_x2_s32f_multiply_conjugate_32fc();
40			*
41			* \endcode
42			*/
43
44			#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45			#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46
47			#include <inttypes.h>
48			#include <stdio.h>
49			#include <volk/volk_complex.h>
50
51			#ifdef LV_HAVE_AVX2
52			#include <immintrin.h>
53
54			static inline void
55		2	volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
56			const lv_8sc_t* aVector,
57			const lv_8sc_t* bVector,
58			const float scalar,
59			unsigned int num_points)
60			{
61		2	unsigned int number = 0;
62		2	const unsigned int oneEigthPoints = num_points / 8;
63
64			__m256i x, y, realz, imagz;
65			__m256 ret, retlo, rethi;
66		2	lv_32fc_t* c = cVector;
67		2	const lv_8sc_t* a = aVector;
68		2	const lv_8sc_t* b = bVector;
69			__m256i conjugateSign =
70		2	_mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
71
72		2	__m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73
74	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < oneEigthPoints; number++) {
75			// Convert 8 bit values into 16 bit values
76		65532	x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77		65532	y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
78
79			// Calculate the ar*cr - ai*(-ci) portions
80		32766	realz = _mm256_madd_epi16(x, y);
81
82			// Calculate the complex conjugate of the cr + ci j values
83		32766	y = _mm256_sign_epi16(y, conjugateSign);
84
85			// Shift the order of the cr and ci values
86		32766	y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87			_MM_SHUFFLE(2, 3, 0, 1));
88
89			// Calculate the ar*(-ci) + cr*(ai)
90		32766	imagz = _mm256_madd_epi16(x, y);
91
92			// Interleave real and imaginary and then convert to float values
93		65532	retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
94
95			// Normalize the floating point values
96		32766	retlo = _mm256_mul_ps(retlo, invScalar);
97
98			// Interleave real and imaginary and then convert to float values
99		65532	rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
100
101			// Normalize the floating point values
102		32766	rethi = _mm256_mul_ps(rethi, invScalar);
103
104		32766	ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105			_mm256_store_ps((float*)c, ret);
106		32766	c += 4;
107
108		32766	ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109			_mm256_store_ps((float*)c, ret);
110		32766	c += 4;
111
112		32766	a += 8;
113		32766	b += 8;
114			}
115
116		2	number = oneEigthPoints * 8;
117		2	float* cFloatPtr = (float*)&cVector[number];
118		2	int8_t* a8Ptr = (int8_t*)&aVector[number];
119		2	int8_t* b8Ptr = (int8_t*)&bVector[number];
120	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
121		14	float aReal = (float)*a8Ptr++;
122		14	float aImag = (float)*a8Ptr++;
123		14	lv_32fc_t aVal = lv_cmake(aReal, aImag);
124		14	float bReal = (float)*b8Ptr++;
125		14	float bImag = (float)*b8Ptr++;
126		14	lv_32fc_t bVal = lv_cmake(bReal, -bImag);
127		14	lv_32fc_t temp = aVal * bVal;
128
129		14	*cFloatPtr++ = lv_creal(temp) / scalar;
130		14	*cFloatPtr++ = lv_cimag(temp) / scalar;
131			}
132		2	}
133			#endif /* LV_HAVE_AVX2*/
134
135
136			#ifdef LV_HAVE_SSE4_1
137			#include <smmintrin.h>
138
139			static inline void
140		2	volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
141			const lv_8sc_t* aVector,
142			const lv_8sc_t* bVector,
143			const float scalar,
144			unsigned int num_points)
145			{
146		2	unsigned int number = 0;
147		2	const unsigned int quarterPoints = num_points / 4;
148
149			__m128i x, y, realz, imagz;
150			__m128 ret;
151		2	lv_32fc_t* c = cVector;
152		2	const lv_8sc_t* a = aVector;
153		2	const lv_8sc_t* b = bVector;
154		2	__m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
155
156		2	__m128 invScalar = _mm_set_ps1(1.0 / scalar);
157
158	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
159			// Convert into 8 bit values into 16 bit values
160		131068	x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161		131068	y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
162
163			// Calculate the ar*cr - ai*(-ci) portions
164		65534	realz = _mm_madd_epi16(x, y);
165
166			// Calculate the complex conjugate of the cr + ci j values
167		65534	y = _mm_sign_epi16(y, conjugateSign);
168
169			// Shift the order of the cr and ci values
170		65534	y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171			_MM_SHUFFLE(2, 3, 0, 1));
172
173			// Calculate the ar*(-ci) + cr*(ai)
174		65534	imagz = _mm_madd_epi16(x, y);
175
176			// Interleave real and imaginary and then convert to float values
177		131068	ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
178
179			// Normalize the floating point values
180		65534	ret = _mm_mul_ps(ret, invScalar);
181
182			// Store the floating point values
183			_mm_store_ps((float*)c, ret);
184		65534	c += 2;
185
186			// Interleave real and imaginary and then convert to float values
187		131068	ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
188
189			// Normalize the floating point values
190		65534	ret = _mm_mul_ps(ret, invScalar);
191
192			// Store the floating point values
193			_mm_store_ps((float*)c, ret);
194		65534	c += 2;
195
196		65534	a += 4;
197		65534	b += 4;
198			}
199
200		2	number = quarterPoints * 4;
201		2	float* cFloatPtr = (float*)&cVector[number];
202		2	int8_t* a8Ptr = (int8_t*)&aVector[number];
203		2	int8_t* b8Ptr = (int8_t*)&bVector[number];
204	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
205		6	float aReal = (float)*a8Ptr++;
206		6	float aImag = (float)*a8Ptr++;
207		6	lv_32fc_t aVal = lv_cmake(aReal, aImag);
208		6	float bReal = (float)*b8Ptr++;
209		6	float bImag = (float)*b8Ptr++;
210		6	lv_32fc_t bVal = lv_cmake(bReal, -bImag);
211		6	lv_32fc_t temp = aVal * bVal;
212
213		6	*cFloatPtr++ = lv_creal(temp) / scalar;
214		6	*cFloatPtr++ = lv_cimag(temp) / scalar;
215			}
216		2	}
217			#endif /* LV_HAVE_SSE4_1 */
218
219
220			#ifdef LV_HAVE_GENERIC
221
222			static inline void
223		2	volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
224			const lv_8sc_t* aVector,
225			const lv_8sc_t* bVector,
226			const float scalar,
227			unsigned int num_points)
228			{
229		2	unsigned int number = 0;
230		2	float* cPtr = (float*)cVector;
231		2	const float invScalar = 1.0 / scalar;
232		2	int8_t* a8Ptr = (int8_t*)aVector;
233		2	int8_t* b8Ptr = (int8_t*)bVector;
234	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
235		262142	float aReal = (float)*a8Ptr++;
236		262142	float aImag = (float)*a8Ptr++;
237		262142	lv_32fc_t aVal = lv_cmake(aReal, aImag);
238		262142	float bReal = (float)*b8Ptr++;
239		262142	float bImag = (float)*b8Ptr++;
240		262142	lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241		262142	lv_32fc_t temp = aVal * bVal;
242
243		262142	*cPtr++ = (lv_creal(temp) * invScalar);
244		262142	*cPtr++ = (lv_cimag(temp) * invScalar);
245			}
246		2	}
247			#endif /* LV_HAVE_GENERIC */
248
249
250			#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251
252			#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253			#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254
255			#include <inttypes.h>
256			#include <stdio.h>
257			#include <volk/volk_complex.h>
258
259			#ifdef LV_HAVE_AVX2
260			#include <immintrin.h>
261
262			static inline void
263		2	volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
264			const lv_8sc_t* aVector,
265			const lv_8sc_t* bVector,
266			const float scalar,
267			unsigned int num_points)
268			{
269		2	unsigned int number = 0;
270		2	const unsigned int oneEigthPoints = num_points / 8;
271
272			__m256i x, y, realz, imagz;
273			__m256 ret, retlo, rethi;
274		2	lv_32fc_t* c = cVector;
275		2	const lv_8sc_t* a = aVector;
276		2	const lv_8sc_t* b = bVector;
277			__m256i conjugateSign =
278		2	_mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
279
280		2	__m256 invScalar = _mm256_set1_ps(1.0 / scalar);
281
282	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < oneEigthPoints; number++) {
283			// Convert 8 bit values into 16 bit values
284		65532	x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
285		65532	y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
286
287			// Calculate the ar*cr - ai*(-ci) portions
288		32766	realz = _mm256_madd_epi16(x, y);
289
290			// Calculate the complex conjugate of the cr + ci j values
291		32766	y = _mm256_sign_epi16(y, conjugateSign);
292
293			// Shift the order of the cr and ci values
294		32766	y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
295			_MM_SHUFFLE(2, 3, 0, 1));
296
297			// Calculate the ar*(-ci) + cr*(ai)
298		32766	imagz = _mm256_madd_epi16(x, y);
299
300			// Interleave real and imaginary and then convert to float values
301		65532	retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
302
303			// Normalize the floating point values
304		32766	retlo = _mm256_mul_ps(retlo, invScalar);
305
306			// Interleave real and imaginary and then convert to float values
307		65532	rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
308
309			// Normalize the floating point values
310		32766	rethi = _mm256_mul_ps(rethi, invScalar);
311
312		32766	ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
313			_mm256_storeu_ps((float*)c, ret);
314		32766	c += 4;
315
316		32766	ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
317			_mm256_storeu_ps((float*)c, ret);
318		32766	c += 4;
319
320		32766	a += 8;
321		32766	b += 8;
322			}
323
324		2	number = oneEigthPoints * 8;
325		2	float* cFloatPtr = (float*)&cVector[number];
326		2	int8_t* a8Ptr = (int8_t*)&aVector[number];
327		2	int8_t* b8Ptr = (int8_t*)&bVector[number];
328	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
329		14	float aReal = (float)*a8Ptr++;
330		14	float aImag = (float)*a8Ptr++;
331		14	lv_32fc_t aVal = lv_cmake(aReal, aImag);
332		14	float bReal = (float)*b8Ptr++;
333		14	float bImag = (float)*b8Ptr++;
334		14	lv_32fc_t bVal = lv_cmake(bReal, -bImag);
335		14	lv_32fc_t temp = aVal * bVal;
336
337		14	*cFloatPtr++ = lv_creal(temp) / scalar;
338		14	*cFloatPtr++ = lv_cimag(temp) / scalar;
339			}
340		2	}
341			#endif /* LV_HAVE_AVX2*/
342
343
344			#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
345