GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_64f_add_64f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	56	56	100.0%
Functions:	3	3	100.0%
Branches:	10	10	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2018 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_64f_add_64f
    
       *
    
       * \b Overview
    
       *
    
       * Adds two input vectors and store result as a double-precision vectors. One
    
       * of the input vector is defined as a single precision floating point, so
    
       * upcasting is performed before the addition
    
       *
    
       * c[i] = a[i] + b[i]
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_64f_add_64f(double* cVector, const double* aVector, const
    
       * double* bVector, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: First input vector.
    
       * \li bVector: Second input vector.
    
       * \li num_points: The number of values in both input vectors.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       * add elements of an increasing vector by those of a decreasing vector.
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
    
       *   double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = (double)ii;
    
       *       decreasing[ii] = 10.f - (double)ii;
    
       *   }
    
       *
    
       *   volk_32f_64f_add_64f(out, increasing, decreasing, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %1.2F\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(decreasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_64f_add_64f_H
    
      #define INCLUDED_volk_32f_64f_add_64f_H
    
      #include <inttypes.h>
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_64f_add_64f_generic(double* cVector,
    
                                                      const float* aVector,
    
                                                      const double* bVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          double* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const double* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEONV8
    
      #include <arm_neon.h>
    
      static inline void volk_32f_64f_add_64f_neon(double* cVector,
    
                                                   const float* aVector,
    
                                                   const double* bVector,
    
                                                   unsigned int num_points)
    
      {
    
          unsigned int number = 0;
    
          const unsigned int half_points = num_points / 2;
    
          double* cPtr = cVector;
    
          const float* aPtr = aVector;
    
          const double* bPtr = bVector;
    
          float64x2_t aVal, bVal, cVal;
    
          float32x2_t aVal1;
    
          for (number = 0; number < half_points; number++) {
    
              // Load in to NEON registers
    
              aVal1 = vld1_f32(aPtr);
    
              bVal = vld1q_f64(bPtr);
    
              __VOLK_PREFETCH(aPtr + 2);
    
              __VOLK_PREFETCH(bPtr + 2);
    
              aPtr += 2; // q uses quadwords, 4 floats per vadd
    
              bPtr += 2;
    
              // Vector conversion
    
              aVal = vcvt_f64_f32(aVal1);
    
              // vector add
    
              cVal = vaddq_f64(aVal, bVal);
    
              // Store the results back into the C container
    
              vst1q_f64(cPtr, cVal);
    
              cPtr += 2;
    
          }
    
          number = half_points * 2; // should be = num_points
    
          for (; number < num_points; number++) {
    
              *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEONV8 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
    
                                                    const float* aVector,
    
                                                    const double* bVector,
    
                                                    unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighth_points = num_points / 8;
    
      2
          double* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const double* bPtr = bVector;
    
          __m256 aVal;
    
          __m128 aVal1, aVal2;
    
          __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighth_points; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              bVal1 = _mm256_loadu_pd(bPtr);
    
      32766
              bVal2 = _mm256_loadu_pd(bPtr + 4);
    
      32766
              aVal1 = _mm256_extractf128_ps(aVal, 0);
    
      32766
              aVal2 = _mm256_extractf128_ps(aVal, 1);
    
      32766
              aDbl1 = _mm256_cvtps_pd(aVal1);
    
      32766
              aDbl2 = _mm256_cvtps_pd(aVal2);
    
      32766
              cVal1 = _mm256_add_pd(aDbl1, bVal1);
    
      32766
              cVal2 = _mm256_add_pd(aDbl2, bVal2);
    
              _mm256_storeu_pd(cPtr,
    
                               cVal1); // Store the results back into the C container
    
      32766
              _mm256_storeu_pd(cPtr + 4,
    
                               cVal2); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighth_points * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
    
                                                    const float* aVector,
    
                                                    const double* bVector,
    
                                                    unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighth_points = num_points / 8;
    
      2
          double* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const double* bPtr = bVector;
    
          __m256 aVal;
    
          __m128 aVal1, aVal2;
    
          __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighth_points; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              bVal1 = _mm256_load_pd(bPtr);
    
      32766
              bVal2 = _mm256_load_pd(bPtr + 4);
    
      32766
              aVal1 = _mm256_extractf128_ps(aVal, 0);
    
      32766
              aVal2 = _mm256_extractf128_ps(aVal, 1);
    
      32766
              aDbl1 = _mm256_cvtps_pd(aVal1);
    
      32766
              aDbl2 = _mm256_cvtps_pd(aVal2);
    
      32766
              cVal1 = _mm256_add_pd(aDbl1, bVal1);
    
      32766
              cVal2 = _mm256_add_pd(aDbl2, bVal2);
    
              _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
    
      32766
              _mm256_store_pd(cPtr + 4,
    
                              cVal2); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighth_points * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32f_64f_add_64f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2018 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_64f_add_64f
12			*
13			* \b Overview
14			*
15			* Adds two input vectors and store result as a double-precision vectors. One
16			* of the input vector is defined as a single precision floating point, so
17			* upcasting is performed before the addition
18			*
19			* c[i] = a[i] + b[i]
20			*
21			* <b>Dispatcher Prototype</b>
22			* \code
23			* void volk_32f_64f_add_64f(double* cVector, const double* aVector, const
24			* double* bVector, unsigned int num_points) \endcode
25			*
26			* \b Inputs
27			* \li aVector: First input vector.
28			* \li bVector: Second input vector.
29			* \li num_points: The number of values in both input vectors.
30			*
31			* \b Outputs
32			* \li cVector: The output vector.
33			*
34			* \b Example
35			* add elements of an increasing vector by those of a decreasing vector.
36			* \code
37			* int N = 10;
38			* unsigned int alignment = volk_get_alignment();
39			* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
40			* double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
41			* double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
42			*
43			* for(unsigned int ii = 0; ii < N; ++ii){
44			* increasing[ii] = (double)ii;
45			* decreasing[ii] = 10.f - (double)ii;
46			* }
47			*
48			* volk_32f_64f_add_64f(out, increasing, decreasing, N);
49			*
50			* for(unsigned int ii = 0; ii < N; ++ii){
51			* printf("out[%u] = %1.2F\n", ii, out[ii]);
52			* }
53			*
54			* volk_free(increasing);
55			* volk_free(decreasing);
56			* volk_free(out);
57			* \endcode
58			*/
59
60			#ifndef INCLUDED_volk_32f_64f_add_64f_H
61			#define INCLUDED_volk_32f_64f_add_64f_H
62
63			#include <inttypes.h>
64
65			#ifdef LV_HAVE_GENERIC
66
67		2	static inline void volk_32f_64f_add_64f_generic(double* cVector,
68			const float* aVector,
69			const double* bVector,
70			unsigned int num_points)
71			{
72		2	double* cPtr = cVector;
73		2	const float* aPtr = aVector;
74		2	const double* bPtr = bVector;
75		2	unsigned int number = 0;
76
77	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
78		262142	*cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
79			}
80		2	}
81
82			#endif /* LV_HAVE_GENERIC */
83
84			#ifdef LV_HAVE_NEONV8
85			#include <arm_neon.h>
86
87			static inline void volk_32f_64f_add_64f_neon(double* cVector,
88			const float* aVector,
89			const double* bVector,
90			unsigned int num_points)
91			{
92			unsigned int number = 0;
93			const unsigned int half_points = num_points / 2;
94
95			double* cPtr = cVector;
96			const float* aPtr = aVector;
97			const double* bPtr = bVector;
98
99			float64x2_t aVal, bVal, cVal;
100			float32x2_t aVal1;
101			for (number = 0; number < half_points; number++) {
102			// Load in to NEON registers
103			aVal1 = vld1_f32(aPtr);
104			bVal = vld1q_f64(bPtr);
105			__VOLK_PREFETCH(aPtr + 2);
106			__VOLK_PREFETCH(bPtr + 2);
107			aPtr += 2; // q uses quadwords, 4 floats per vadd
108			bPtr += 2;
109
110			// Vector conversion
111			aVal = vcvt_f64_f32(aVal1);
112			// vector add
113			cVal = vaddq_f64(aVal, bVal);
114			// Store the results back into the C container
115			vst1q_f64(cPtr, cVal);
116
117			cPtr += 2;
118			}
119
120			number = half_points * 2; // should be = num_points
121			for (; number < num_points; number++) {
122			*cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
123			}
124			}
125
126			#endif /* LV_HAVE_NEONV8 */
127
128			#ifdef LV_HAVE_AVX
129
130			#include <immintrin.h>
131			#include <xmmintrin.h>
132
133		2	static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
134			const float* aVector,
135			const double* bVector,
136			unsigned int num_points)
137			{
138		2	unsigned int number = 0;
139		2	const unsigned int eighth_points = num_points / 8;
140
141		2	double* cPtr = cVector;
142		2	const float* aPtr = aVector;
143		2	const double* bPtr = bVector;
144
145			__m256 aVal;
146			__m128 aVal1, aVal2;
147			__m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
148	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighth_points; number++) {
149
150		32766	aVal = _mm256_loadu_ps(aPtr);
151		32766	bVal1 = _mm256_loadu_pd(bPtr);
152		32766	bVal2 = _mm256_loadu_pd(bPtr + 4);
153
154		32766	aVal1 = _mm256_extractf128_ps(aVal, 0);
155		32766	aVal2 = _mm256_extractf128_ps(aVal, 1);
156
157		32766	aDbl1 = _mm256_cvtps_pd(aVal1);
158		32766	aDbl2 = _mm256_cvtps_pd(aVal2);
159
160		32766	cVal1 = _mm256_add_pd(aDbl1, bVal1);
161		32766	cVal2 = _mm256_add_pd(aDbl2, bVal2);
162
163			_mm256_storeu_pd(cPtr,
164			cVal1); // Store the results back into the C container
165		32766	_mm256_storeu_pd(cPtr + 4,
166			cVal2); // Store the results back into the C container
167
168		32766	aPtr += 8;
169		32766	bPtr += 8;
170		32766	cPtr += 8;
171			}
172
173		2	number = eighth_points * 8;
174	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
175		14	*cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
176			}
177		2	}
178
179			#endif /* LV_HAVE_AVX */
180
181			#ifdef LV_HAVE_AVX
182
183			#include <immintrin.h>
184			#include <xmmintrin.h>
185
186		2	static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
187			const float* aVector,
188			const double* bVector,
189			unsigned int num_points)
190			{
191		2	unsigned int number = 0;
192		2	const unsigned int eighth_points = num_points / 8;
193
194		2	double* cPtr = cVector;
195		2	const float* aPtr = aVector;
196		2	const double* bPtr = bVector;
197
198			__m256 aVal;
199			__m128 aVal1, aVal2;
200			__m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
201	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighth_points; number++) {
202
203		32766	aVal = _mm256_load_ps(aPtr);
204		32766	bVal1 = _mm256_load_pd(bPtr);
205		32766	bVal2 = _mm256_load_pd(bPtr + 4);
206
207		32766	aVal1 = _mm256_extractf128_ps(aVal, 0);
208		32766	aVal2 = _mm256_extractf128_ps(aVal, 1);
209
210		32766	aDbl1 = _mm256_cvtps_pd(aVal1);
211		32766	aDbl2 = _mm256_cvtps_pd(aVal2);
212
213		32766	cVal1 = _mm256_add_pd(aDbl1, bVal1);
214		32766	cVal2 = _mm256_add_pd(aDbl2, bVal2);
215
216			_mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
217		32766	_mm256_store_pd(cPtr + 4,
218			cVal2); // Store the results back into the C container
219
220		32766	aPtr += 8;
221		32766	bPtr += 8;
222		32766	cPtr += 8;
223			}
224
225		2	number = eighth_points * 8;
226	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
227		14	*cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
228			}
229		2	}
230
231			#endif /* LV_HAVE_AVX */
232
233			#endif /* INCLUDED_volk_32f_64f_add_64f_u_H */
234