GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	82	82	100.0%
Functions:	6	6	100.0%
Branches:	18	20	90.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_x2_multiply_conjugate_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Multiplies a complex vector by the conjugate of a second complex
    
       * vector and returns the complex result.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
    
       * const lv_32fc_t* bVector, unsigned int num_points); \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The first input vector of complex floats.
    
       * \li bVector: The second input vector of complex floats that is conjugated.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector complex floats.
    
       *
    
       * \b Example
    
       * Calculate mag^2 of a signal using x * conj(x).
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   lv_32fc_t* sig_1  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *
    
       *   float delta = 2.f*M_PI / (float)N;
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       float real_1 = std::cos(0.3f * (float)ii);
    
       *       float imag_1 = std::sin(0.3f * (float)ii);
    
       *       sig_1[ii] = lv_cmake(real_1, imag_1);
    
       *   }
    
       *
    
       *   volk_32fc_x2_multiply_conjugate_32fc(out, sig_1, sig_1, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
    
       *   }
    
       *   printf("\n");
    
       *
    
       *   volk_free(sig_1);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
    
      #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
    
                                                                    const lv_32fc_t* aVector,
    
                                                                    const lv_32fc_t* bVector,
    
                                                                    unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m256 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              x = _mm256_loadu_ps(
    
                  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      65534
              y = _mm256_loadu_ps(
    
                  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    
      65534
              z = _mm256_complexconjugatemul_ps(x, y);
    
              _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * lv_conj(*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
    
                                                                     const lv_32fc_t* aVector,
    
                                                                     const lv_32fc_t* bVector,
    
                                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int halfPoints = num_points / 2;
    
          __m128 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (; number < halfPoints; number++) {
    
      131070
              x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
      131070
              y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      131070
              z = _mm_complexconjugatemul_ps(x, y);
    
              _mm_storeu_ps((float*)c, z); // Store the results back into the C container
    
      131070
              a += 2;
    
      131070
              b += 2;
    
      131070
              c += 2;
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if ((num_points % 2) != 0) {
    
      2
              *c = (*a) * lv_conj(*b);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
    
                                                                      const lv_32fc_t* aVector,
    
                                                                      const lv_32fc_t* bVector,
    
                                                                      unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const lv_32fc_t* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
    
      #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
    
      #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
    
                                                                    const lv_32fc_t* aVector,
    
                                                                    const lv_32fc_t* bVector,
    
                                                                    unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m256 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      65534
              y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    
      65534
              z = _mm256_complexconjugatemul_ps(x, y);
    
              _mm256_store_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * lv_conj(*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
    
                                                                     const lv_32fc_t* aVector,
    
                                                                     const lv_32fc_t* bVector,
    
                                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int halfPoints = num_points / 2;
    
          __m128 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (; number < halfPoints; number++) {
    
      131070
              x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
      131070
              y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      131070
              z = _mm_complexconjugatemul_ps(x, y);
    
              _mm_store_ps((float*)c, z); // Store the results back into the C container
    
      131070
              a += 2;
    
      131070
              b += 2;
    
      131070
              c += 2;
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if ((num_points % 2) != 0) {
    
      2
              *c = (*a) * lv_conj(*b);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
    
                                                                   const lv_32fc_t* aVector,
    
                                                                   const lv_32fc_t* bVector,
    
                                                                   unsigned int num_points)
    
      {
    
          lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    
          lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    
          unsigned int quarter_points = num_points / 4;
    
          float32x4x2_t a_val, b_val, c_val;
    
          float32x4x2_t tmp_real, tmp_imag;
    
          unsigned int number = 0;
    
          for (number = 0; number < quarter_points; ++number) {
    
              a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    
              b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    
              b_val.val[1] = vnegq_f32(b_val.val[1]);
    
              __VOLK_PREFETCH(a_ptr + 4);
    
              __VOLK_PREFETCH(b_ptr + 4);
    
              // multiply the real*real and imag*imag to get real result
    
              // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    
              tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    
              // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    
              tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
    
              // Multiply cross terms to get the imaginary result
    
              // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    
              tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    
              // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    
              tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    
              // store the results
    
              c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    
              c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    
              vst2q_f32((float*)cVector, c_val);
    
              a_ptr += 4;
    
              b_ptr += 4;
    
              cVector += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *cVector++ = (*a_ptr++) * conj(*b_ptr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
    
                                                     const lv_32fc_t* aVector,
    
                                                     const lv_32fc_t* bVector,
    
                                                     unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const lv_32fc_t* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_x2_multiply_conjugate_32fc
12			*
13			* \b Overview
14			*
15			* Multiplies a complex vector by the conjugate of a second complex
16			* vector and returns the complex result.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
21			* const lv_32fc_t* bVector, unsigned int num_points); \endcode
22			*
23			* \b Inputs
24			* \li aVector: The first input vector of complex floats.
25			* \li bVector: The second input vector of complex floats that is conjugated.
26			* \li num_points: The number of data points.
27			*
28			* \b Outputs
29			* \li outputVector: The output vector complex floats.
30			*
31			* \b Example
32			* Calculate mag^2 of a signal using x * conj(x).
33			* \code
34			* int N = 10;
35			* unsigned int alignment = volk_get_alignment();
36			* lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37			* lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38			*
39			* float delta = 2.f*M_PI / (float)N;
40			* for(unsigned int ii = 0; ii < N; ++ii){
41			* float real_1 = std::cos(0.3f * (float)ii);
42			* float imag_1 = std::sin(0.3f * (float)ii);
43			* sig_1[ii] = lv_cmake(real_1, imag_1);
44			* }
45			*
46			* volk_32fc_x2_multiply_conjugate_32fc(out, sig_1, sig_1, N);
47			*
48			* for(unsigned int ii = 0; ii < N; ++ii){
49			* printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
50			* }
51			* printf("\n");
52			*
53			* volk_free(sig_1);
54			* volk_free(out);
55			* \endcode
56			*/
57
58			#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
59			#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
60
61			#include <float.h>
62			#include <inttypes.h>
63			#include <stdio.h>
64			#include <volk/volk_complex.h>
65
66			#ifdef LV_HAVE_AVX
67			#include <immintrin.h>
68			#include <volk/volk_avx_intrinsics.h>
69
70		2	static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
71			const lv_32fc_t* aVector,
72			const lv_32fc_t* bVector,
73			unsigned int num_points)
74			{
75		2	unsigned int number = 0;
76		2	const unsigned int quarterPoints = num_points / 4;
77
78			__m256 x, y, z;
79		2	lv_32fc_t* c = cVector;
80		2	const lv_32fc_t* a = aVector;
81		2	const lv_32fc_t* b = bVector;
82
83	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
84		65534	x = _mm256_loadu_ps(
85			(float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
86		65534	y = _mm256_loadu_ps(
87			(float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
88		65534	z = _mm256_complexconjugatemul_ps(x, y);
89			_mm256_storeu_ps((float*)c, z); // Store the results back into the C container
90
91		65534	a += 4;
92		65534	b += 4;
93		65534	c += 4;
94			}
95
96		2	number = quarterPoints * 4;
97
98	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
99		6	*c++ = (*a++) * lv_conj(*b++);
100			}
101		2	}
102			#endif /* LV_HAVE_AVX */
103
104
105			#ifdef LV_HAVE_SSE3
106			#include <pmmintrin.h>
107			#include <volk/volk_sse3_intrinsics.h>
108
109		2	static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
110			const lv_32fc_t* aVector,
111			const lv_32fc_t* bVector,
112			unsigned int num_points)
113			{
114		2	unsigned int number = 0;
115		2	const unsigned int halfPoints = num_points / 2;
116
117			__m128 x, y, z;
118		2	lv_32fc_t* c = cVector;
119		2	const lv_32fc_t* a = aVector;
120		2	const lv_32fc_t* b = bVector;
121
122	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (; number < halfPoints; number++) {
123		131070	x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
124		131070	y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
125		131070	z = _mm_complexconjugatemul_ps(x, y);
126			_mm_storeu_ps((float*)c, z); // Store the results back into the C container
127
128		131070	a += 2;
129		131070	b += 2;
130		131070	c += 2;
131			}
132
133	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if ((num_points % 2) != 0) {
134		2	*c = (*a) * lv_conj(*b);
135			}
136		2	}
137			#endif /* LV_HAVE_SSE */
138
139
140			#ifdef LV_HAVE_GENERIC
141
142		2	static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
143			const lv_32fc_t* aVector,
144			const lv_32fc_t* bVector,
145			unsigned int num_points)
146			{
147		2	lv_32fc_t* cPtr = cVector;
148		2	const lv_32fc_t* aPtr = aVector;
149		2	const lv_32fc_t* bPtr = bVector;
150		2	unsigned int number = 0;
151
152	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
153		262142	*cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
154			}
155		2	}
156			#endif /* LV_HAVE_GENERIC */
157
158
159			#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
160			#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
161			#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
162
163			#include <float.h>
164			#include <inttypes.h>
165			#include <stdio.h>
166			#include <volk/volk_complex.h>
167
168			#ifdef LV_HAVE_AVX
169			#include <immintrin.h>
170			#include <volk/volk_avx_intrinsics.h>
171
172		2	static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
173			const lv_32fc_t* aVector,
174			const lv_32fc_t* bVector,
175			unsigned int num_points)
176			{
177		2	unsigned int number = 0;
178		2	const unsigned int quarterPoints = num_points / 4;
179
180			__m256 x, y, z;
181		2	lv_32fc_t* c = cVector;
182		2	const lv_32fc_t* a = aVector;
183		2	const lv_32fc_t* b = bVector;
184
185	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
186		65534	x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
187		65534	y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
188		65534	z = _mm256_complexconjugatemul_ps(x, y);
189			_mm256_store_ps((float*)c, z); // Store the results back into the C container
190
191		65534	a += 4;
192		65534	b += 4;
193		65534	c += 4;
194			}
195
196		2	number = quarterPoints * 4;
197
198	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
199		6	*c++ = (*a++) * lv_conj(*b++);
200			}
201		2	}
202			#endif /* LV_HAVE_AVX */
203
204
205			#ifdef LV_HAVE_SSE3
206			#include <pmmintrin.h>
207			#include <volk/volk_sse3_intrinsics.h>
208
209		2	static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
210			const lv_32fc_t* aVector,
211			const lv_32fc_t* bVector,
212			unsigned int num_points)
213			{
214		2	unsigned int number = 0;
215		2	const unsigned int halfPoints = num_points / 2;
216
217			__m128 x, y, z;
218		2	lv_32fc_t* c = cVector;
219		2	const lv_32fc_t* a = aVector;
220		2	const lv_32fc_t* b = bVector;
221
222	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (; number < halfPoints; number++) {
223		131070	x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
224		131070	y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
225		131070	z = _mm_complexconjugatemul_ps(x, y);
226			_mm_store_ps((float*)c, z); // Store the results back into the C container
227
228		131070	a += 2;
229		131070	b += 2;
230		131070	c += 2;
231			}
232
233	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if ((num_points % 2) != 0) {
234		2	*c = (*a) * lv_conj(*b);
235			}
236		2	}
237			#endif /* LV_HAVE_SSE */
238
239
240			#ifdef LV_HAVE_NEON
241			#include <arm_neon.h>
242
243			static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
244			const lv_32fc_t* aVector,
245			const lv_32fc_t* bVector,
246			unsigned int num_points)
247			{
248			lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
249			lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
250			unsigned int quarter_points = num_points / 4;
251			float32x4x2_t a_val, b_val, c_val;
252			float32x4x2_t tmp_real, tmp_imag;
253			unsigned int number = 0;
254
255			for (number = 0; number < quarter_points; ++number) {
256			a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
257			b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
258			b_val.val[1] = vnegq_f32(b_val.val[1]);
259			__VOLK_PREFETCH(a_ptr + 4);
260			__VOLK_PREFETCH(b_ptr + 4);
261
262			// multiply the real*real and imag*imag to get real result
263			// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
264			tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
265			// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
266			tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
267
268			// Multiply cross terms to get the imaginary result
269			// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
270			tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
271			// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
272			tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
273
274			// store the results
275			c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
276			c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
277			vst2q_f32((float*)cVector, c_val);
278
279			a_ptr += 4;
280			b_ptr += 4;
281			cVector += 4;
282			}
283
284			for (number = quarter_points * 4; number < num_points; number++) {
285			*cVector++ = (*a_ptr++) * conj(*b_ptr++);
286			}
287			}
288			#endif /* LV_HAVE_NEON */
289
290
291			#ifdef LV_HAVE_GENERIC
292
293			static inline void
294		2	volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
295			const lv_32fc_t* aVector,
296			const lv_32fc_t* bVector,
297			unsigned int num_points)
298			{
299		2	lv_32fc_t* cPtr = cVector;
300		2	const lv_32fc_t* aPtr = aVector;
301		2	const lv_32fc_t* bPtr = bVector;
302		2	unsigned int number = 0;
303
304	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
305		262142	*cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
306			}
307		2	}
308			#endif /* LV_HAVE_GENERIC */
309
310
311			#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
312