GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_32f_multiply_32fc.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	61	61	100.0%
Functions:	4	4	100.0%
Branches:	10	10	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_32f_multiply_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Multiplies a complex vector by a floating point vector and returns
    
       * the complex result.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
    
       * float* bVector, unsigned int num_points); \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The input vector of complex floats.
    
       * \li bVector: The input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector complex floats.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_32fc_32f_multiply_32fc();
    
       *
    
       * volk_free(x);
    
       * volk_free(t);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
    
      #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
    
                                                           const lv_32fc_t* aVector,
    
                                                           const float* bVector,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
    
      2
          __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal1 = _mm256_load_ps((float*)aPtr);
    
      32766
              aPtr += 4;
    
      32766
              aVal2 = _mm256_load_ps((float*)aPtr);
    
      32766
              aPtr += 4;
    
      32766
              bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
    
      32766
              bPtr += 8;
    
      32766
              bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
    
      32766
              bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
    
      32766
              bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
    
      32766
              bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
    
      32766
              cVal1 = _mm256_mul_ps(aVal1, bVal1);
    
      32766
              cVal2 = _mm256_mul_ps(aVal2, bVal2);
    
              _mm256_store_ps((float*)cPtr,
    
                              cVal1); // Store the results back into the C container
    
      32766
              cPtr += 4;
    
              _mm256_store_ps((float*)cPtr,
    
                              cVal2); // Store the results back into the C container
    
      32766
              cPtr += 4;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; ++number) {
    
      14
              *cPtr++ = (*aPtr++) * (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
    
                                                           const lv_32fc_t* aVector,
    
                                                           const float* bVector,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal1 = _mm_load_ps((const float*)aPtr);
    
      65534
              aPtr += 2;
    
      65534
              aVal2 = _mm_load_ps((const float*)aPtr);
    
      65534
              aPtr += 2;
    
      65534
              bVal = _mm_load_ps(bPtr);
    
      65534
              bPtr += 4;
    
      65534
              bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
    
      65534
              bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
    
      65534
              cVal = _mm_mul_ps(aVal1, bVal1);
    
              _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
    
      65534
              cPtr += 2;
    
      65534
              cVal = _mm_mul_ps(aVal2, bVal2);
    
              _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
    
      65534
              cPtr += 2;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *cPtr++ = (*aPtr++) * (*bPtr);
    
      6
              bPtr++;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
    
                                                             const lv_32fc_t* aVector,
    
                                                             const float* bVector,
    
                                                             unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) * (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
    
                                                          const lv_32fc_t* aVector,
    
                                                          const float* bVector,
    
                                                          unsigned int num_points)
    
      {
    
          lv_32fc_t* cPtr = cVector;
    
          const lv_32fc_t* aPtr = aVector;
    
          const float* bPtr = bVector;
    
          unsigned int number = 0;
    
          unsigned int quarter_points = num_points / 4;
    
          float32x4x2_t inputVector, outputVector;
    
          float32x4_t tapsVector;
    
          for (number = 0; number < quarter_points; number++) {
    
              inputVector = vld2q_f32((float*)aPtr);
    
              tapsVector = vld1q_f32(bPtr);
    
              outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
    
              outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
    
              vst2q_f32((float*)cPtr, outputVector);
    
              aPtr += 4;
    
              bPtr += 4;
    
              cPtr += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *cPtr++ = (*aPtr++) * (*bPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
    
                                                         const lv_32fc_t* aVector,
    
                                                         const float* bVector,
    
                                                         unsigned int num_points);
    
      2
      static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
    
                                                           const lv_32fc_t* aVector,
    
                                                           const float* bVector,
    
                                                           unsigned int num_points)
    
      {
    
      2
          volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_32f_multiply_32fc
12			*
13			* \b Overview
14			*
15			* Multiplies a complex vector by a floating point vector and returns
16			* the complex result.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
21			* float* bVector, unsigned int num_points); \endcode
22			*
23			* \b Inputs
24			* \li aVector: The input vector of complex floats.
25			* \li bVector: The input vector of floats.
26			* \li num_points: The number of data points.
27			*
28			* \b Outputs
29			* \li outputVector: The output vector complex floats.
30			*
31			* \b Example
32			* \code
33			* int N = 10000;
34			*
35			* volk_32fc_32f_multiply_32fc();
36			*
37			* volk_free(x);
38			* volk_free(t);
39			* \endcode
40			*/
41
42			#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
43			#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
44
45			#include <inttypes.h>
46			#include <stdio.h>
47
48			#ifdef LV_HAVE_AVX
49			#include <immintrin.h>
50
51		2	static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
52			const lv_32fc_t* aVector,
53			const float* bVector,
54			unsigned int num_points)
55			{
56		2	unsigned int number = 0;
57		2	const unsigned int eighthPoints = num_points / 8;
58
59		2	lv_32fc_t* cPtr = cVector;
60		2	const lv_32fc_t* aPtr = aVector;
61		2	const float* bPtr = bVector;
62
63			__m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
64
65		2	__m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
66
67	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
68
69		32766	aVal1 = _mm256_load_ps((float*)aPtr);
70		32766	aPtr += 4;
71
72		32766	aVal2 = _mm256_load_ps((float*)aPtr);
73		32766	aPtr += 4;
74
75		32766	bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
76		32766	bPtr += 8;
77
78		32766	bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
79		32766	bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
80
81		32766	bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
82		32766	bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
83
84		32766	cVal1 = _mm256_mul_ps(aVal1, bVal1);
85		32766	cVal2 = _mm256_mul_ps(aVal2, bVal2);
86
87			_mm256_store_ps((float*)cPtr,
88			cVal1); // Store the results back into the C container
89		32766	cPtr += 4;
90
91			_mm256_store_ps((float*)cPtr,
92			cVal2); // Store the results back into the C container
93		32766	cPtr += 4;
94			}
95
96		2	number = eighthPoints * 8;
97	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; ++number) {
98		14	*cPtr++ = (*aPtr++) * (*bPtr++);
99			}
100		2	}
101			#endif /* LV_HAVE_AVX */
102
103
104			#ifdef LV_HAVE_SSE
105			#include <xmmintrin.h>
106
107		2	static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
108			const lv_32fc_t* aVector,
109			const float* bVector,
110			unsigned int num_points)
111			{
112		2	unsigned int number = 0;
113		2	const unsigned int quarterPoints = num_points / 4;
114
115		2	lv_32fc_t* cPtr = cVector;
116		2	const lv_32fc_t* aPtr = aVector;
117		2	const float* bPtr = bVector;
118
119			__m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
120	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
121
122		65534	aVal1 = _mm_load_ps((const float*)aPtr);
123		65534	aPtr += 2;
124
125		65534	aVal2 = _mm_load_ps((const float*)aPtr);
126		65534	aPtr += 2;
127
128		65534	bVal = _mm_load_ps(bPtr);
129		65534	bPtr += 4;
130
131		65534	bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
132		65534	bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
133
134		65534	cVal = _mm_mul_ps(aVal1, bVal1);
135
136			_mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
137		65534	cPtr += 2;
138
139		65534	cVal = _mm_mul_ps(aVal2, bVal2);
140
141			_mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
142
143		65534	cPtr += 2;
144			}
145
146		2	number = quarterPoints * 4;
147	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
148		6	*cPtr++ = (*aPtr++) * (*bPtr);
149		6	bPtr++;
150			}
151		2	}
152			#endif /* LV_HAVE_SSE */
153
154
155			#ifdef LV_HAVE_GENERIC
156
157		2	static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
158			const lv_32fc_t* aVector,
159			const float* bVector,
160			unsigned int num_points)
161			{
162		2	lv_32fc_t* cPtr = cVector;
163		2	const lv_32fc_t* aPtr = aVector;
164		2	const float* bPtr = bVector;
165		2	unsigned int number = 0;
166
167	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
168		262142	*cPtr++ = (*aPtr++) * (*bPtr++);
169			}
170		2	}
171			#endif /* LV_HAVE_GENERIC */
172
173
174			#ifdef LV_HAVE_NEON
175			#include <arm_neon.h>
176
177			static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
178			const lv_32fc_t* aVector,
179			const float* bVector,
180			unsigned int num_points)
181			{
182			lv_32fc_t* cPtr = cVector;
183			const lv_32fc_t* aPtr = aVector;
184			const float* bPtr = bVector;
185			unsigned int number = 0;
186			unsigned int quarter_points = num_points / 4;
187
188			float32x4x2_t inputVector, outputVector;
189			float32x4_t tapsVector;
190			for (number = 0; number < quarter_points; number++) {
191			inputVector = vld2q_f32((float*)aPtr);
192			tapsVector = vld1q_f32(bPtr);
193
194			outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
195			outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
196
197			vst2q_f32((float*)cPtr, outputVector);
198			aPtr += 4;
199			bPtr += 4;
200			cPtr += 4;
201			}
202
203			for (number = quarter_points * 4; number < num_points; number++) {
204			*cPtr++ = (*aPtr++) * (*bPtr++);
205			}
206			}
207			#endif /* LV_HAVE_NEON */
208
209
210			#ifdef LV_HAVE_ORC
211
212			extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
213			const lv_32fc_t* aVector,
214			const float* bVector,
215			unsigned int num_points);
216
217		2	static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
218			const lv_32fc_t* aVector,
219			const float* bVector,
220			unsigned int num_points)
221			{
222		2	volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
223		2	}
224
225			#endif /* LV_HAVE_ORC */
226
227
228			#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
229