GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_x2_add_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	87	121	71.9%
Functions:	7	9	77.8%
Branches:	20	28	71.4%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_x2_add_32f
    
       *
    
       * \b Overview
    
       *
    
       * Adds two vectors together element by element:
    
       *
    
       * c[i] = a[i] + b[i]
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
    
       * unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: First vector of input points.
    
       * \li bVector: Second vector of input points.
    
       * \li num_points: The number of values in both input vector.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       *
    
       * The follow example adds the increasing and decreasing vectors such that the result of
    
       * every summation pair is 10
    
       *
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = (float)ii;
    
       *       decreasing[ii] = 10.f - (float)ii;
    
       *   }
    
       *
    
       *   volk_32f_x2_add_32f(out, increasing, decreasing, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %1.2f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(decreasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
    
      #define INCLUDED_volk_32f_x2_add_32f_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX512F
    
      #include <immintrin.h>
    
      ✗
      static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
    
                                                       const float* aVector,
    
                                                       const float* bVector,
    
                                                       unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int sixteenthPoints = num_points / 16;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
      ✗
          const float* bPtr = bVector;
    
          __m512 aVal, bVal, cVal;
    
      ✗
          for (; number < sixteenthPoints; number++) {
    
      ✗
              aVal = _mm512_loadu_ps(aPtr);
    
      ✗
              bVal = _mm512_loadu_ps(bPtr);
    
      ✗
              cVal = _mm512_add_ps(aVal, bVal);
    
              _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      ✗
              aPtr += 16;
    
      ✗
              bPtr += 16;
    
      ✗
              cPtr += 16;
    
          }
    
      ✗
          number = sixteenthPoints * 16;
    
      ✗
          for (; number < num_points; number++) {
    
      ✗
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX512F */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
    
                                                   const float* aVector,
    
                                                   const float* bVector,
    
                                                   unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m256 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              bVal = _mm256_loadu_ps(bPtr);
    
      32766
              cVal = _mm256_add_ps(aVal, bVal);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
    
                                                   const float* aVector,
    
                                                   const float* bVector,
    
                                                   unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m128 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      65534
              bVal = _mm_loadu_ps(bPtr);
    
      65534
              cVal = _mm_add_ps(aVal, bVal);
    
              _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_x2_add_32f_generic(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float* bVector,
    
                                                     unsigned int num_points)
    
      {
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
    
      #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
    
      #define INCLUDED_volk_32f_x2_add_32f_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX512F
    
      #include <immintrin.h>
    
      ✗
      static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
    
                                                       const float* aVector,
    
                                                       const float* bVector,
    
                                                       unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int sixteenthPoints = num_points / 16;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
      ✗
          const float* bPtr = bVector;
    
          __m512 aVal, bVal, cVal;
    
      ✗
          for (; number < sixteenthPoints; number++) {
    
      ✗
              aVal = _mm512_load_ps(aPtr);
    
      ✗
              bVal = _mm512_load_ps(bPtr);
    
      ✗
              cVal = _mm512_add_ps(aVal, bVal);
    
              _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
    
      ✗
              aPtr += 16;
    
      ✗
              bPtr += 16;
    
      ✗
              cPtr += 16;
    
          }
    
      ✗
          number = sixteenthPoints * 16;
    
      ✗
          for (; number < num_points; number++) {
    
      ✗
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX512F */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
    
                                                   const float* aVector,
    
                                                   const float* bVector,
    
                                                   unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m256 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              bVal = _mm256_load_ps(bPtr);
    
      32766
              cVal = _mm256_add_ps(aVal, bVal);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
    
                                                   const float* aVector,
    
                                                   const float* bVector,
    
                                                   unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m128 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              bVal = _mm_load_ps(bPtr);
    
      65534
              cVal = _mm_add_ps(aVal, bVal);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
    
                                                    const float* aVector,
    
                                                    const float* bVector,
    
                                                    unsigned int num_points)
    
      {
    
          unsigned int number = 0;
    
          const unsigned int quarterPoints = num_points / 4;
    
          float* cPtr = cVector;
    
          const float* aPtr = aVector;
    
          const float* bPtr = bVector;
    
          float32x4_t aVal, bVal, cVal;
    
          for (number = 0; number < quarterPoints; number++) {
    
              // Load in to NEON registers
    
              aVal = vld1q_f32(aPtr);
    
              bVal = vld1q_f32(bPtr);
    
              __VOLK_PREFETCH(aPtr + 4);
    
              __VOLK_PREFETCH(bPtr + 4);
    
              // vector add
    
              cVal = vaddq_f32(aVal, bVal);
    
              // Store the results back into the C container
    
              vst1q_f32(cPtr, cVal);
    
              aPtr += 4; // q uses quadwords, 4 floats per vadd
    
              bPtr += 4;
    
              cPtr += 4;
    
          }
    
          number = quarterPoints * 4; // should be = num_points
    
          for (; number < num_points; number++) {
    
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_NEONV7
    
      extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
    
                                                const float* aVector,
    
                                                const float* bVector,
    
                                                unsigned int num_points);
    
      #endif /* LV_HAVE_NEONV7 */
    
      #ifdef LV_HAVE_NEONV7
    
      extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float* bVector,
    
                                                     unsigned int num_points);
    
      #endif /* LV_HAVE_NEONV7 */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
    
                                                       const float* aVector,
    
                                                       const float* bVector,
    
                                                       unsigned int num_points)
    
      {
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) + (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
    
                                                 const float* aVector,
    
                                                 const float* bVector,
    
                                                 unsigned int num_points);
    
      2
      static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
    
                                                   const float* aVector,
    
                                                   const float* bVector,
    
                                                   unsigned int num_points)
    
      {
    
      2
          volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_x2_add_32f
12			*
13			* \b Overview
14			*
15			* Adds two vectors together element by element:
16			*
17			* c[i] = a[i] + b[i]
18			*
19			* <b>Dispatcher Prototype</b>
20			* \code
21			* void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
22			* unsigned int num_points) \endcode
23			*
24			* \b Inputs
25			* \li aVector: First vector of input points.
26			* \li bVector: Second vector of input points.
27			* \li num_points: The number of values in both input vector.
28			*
29			* \b Outputs
30			* \li cVector: The output vector.
31			*
32			* \b Example
33			*
34			* The follow example adds the increasing and decreasing vectors such that the result of
35			* every summation pair is 10
36			*
37			* \code
38			* int N = 10;
39			* unsigned int alignment = volk_get_alignment();
40			* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
41			* float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
42			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
43			*
44			* for(unsigned int ii = 0; ii < N; ++ii){
45			* increasing[ii] = (float)ii;
46			* decreasing[ii] = 10.f - (float)ii;
47			* }
48			*
49			* volk_32f_x2_add_32f(out, increasing, decreasing, N);
50			*
51			* for(unsigned int ii = 0; ii < N; ++ii){
52			* printf("out[%u] = %1.2f\n", ii, out[ii]);
53			* }
54			*
55			* volk_free(increasing);
56			* volk_free(decreasing);
57			* volk_free(out);
58			* \endcode
59			*/
60
61			#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62			#define INCLUDED_volk_32f_x2_add_32f_u_H
63
64			#include <inttypes.h>
65			#include <stdio.h>
66
67			#ifdef LV_HAVE_AVX512F
68			#include <immintrin.h>
69
70		✗	static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
71			const float* aVector,
72			const float* bVector,
73			unsigned int num_points)
74			{
75		✗	unsigned int number = 0;
76		✗	const unsigned int sixteenthPoints = num_points / 16;
77
78		✗	float* cPtr = cVector;
79		✗	const float* aPtr = aVector;
80		✗	const float* bPtr = bVector;
81
82			__m512 aVal, bVal, cVal;
83		✗	for (; number < sixteenthPoints; number++) {
84
85		✗	aVal = _mm512_loadu_ps(aPtr);
86		✗	bVal = _mm512_loadu_ps(bPtr);
87
88		✗	cVal = _mm512_add_ps(aVal, bVal);
89
90			_mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
91
92		✗	aPtr += 16;
93		✗	bPtr += 16;
94		✗	cPtr += 16;
95			}
96
97		✗	number = sixteenthPoints * 16;
98
99		✗	for (; number < num_points; number++) {
100		✗	*cPtr++ = (*aPtr++) + (*bPtr++);
101			}
102		✗	}
103
104			#endif /* LV_HAVE_AVX512F */
105
106
107			#ifdef LV_HAVE_AVX
108			#include <immintrin.h>
109
110		2	static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
111			const float* aVector,
112			const float* bVector,
113			unsigned int num_points)
114			{
115		2	unsigned int number = 0;
116		2	const unsigned int eighthPoints = num_points / 8;
117		2	float* cPtr = cVector;
118		2	const float* aPtr = aVector;
119		2	const float* bPtr = bVector;
120			__m256 aVal, bVal, cVal;
121	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
122
123		32766	aVal = _mm256_loadu_ps(aPtr);
124		32766	bVal = _mm256_loadu_ps(bPtr);
125
126		32766	cVal = _mm256_add_ps(aVal, bVal);
127
128			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
129
130		32766	aPtr += 8;
131		32766	bPtr += 8;
132		32766	cPtr += 8;
133			}
134
135		2	number = eighthPoints * 8;
136
137	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
138		14	*cPtr++ = (*aPtr++) + (*bPtr++);
139			}
140		2	}
141			#endif /* LV_HAVE_AVX */
142
143
144			#ifdef LV_HAVE_SSE
145			#include <xmmintrin.h>
146
147		2	static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
148			const float* aVector,
149			const float* bVector,
150			unsigned int num_points)
151			{
152		2	unsigned int number = 0;
153		2	const unsigned int quarterPoints = num_points / 4;
154
155		2	float* cPtr = cVector;
156		2	const float* aPtr = aVector;
157		2	const float* bPtr = bVector;
158
159			__m128 aVal, bVal, cVal;
160	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
161
162		65534	aVal = _mm_loadu_ps(aPtr);
163		65534	bVal = _mm_loadu_ps(bPtr);
164
165		65534	cVal = _mm_add_ps(aVal, bVal);
166
167			_mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
168
169		65534	aPtr += 4;
170		65534	bPtr += 4;
171		65534	cPtr += 4;
172			}
173
174		2	number = quarterPoints * 4;
175	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
176		6	*cPtr++ = (*aPtr++) + (*bPtr++);
177			}
178		2	}
179			#endif /* LV_HAVE_SSE */
180
181
182			#ifdef LV_HAVE_GENERIC
183
184		2	static inline void volk_32f_x2_add_32f_generic(float* cVector,
185			const float* aVector,
186			const float* bVector,
187			unsigned int num_points)
188			{
189		2	float* cPtr = cVector;
190		2	const float* aPtr = aVector;
191		2	const float* bPtr = bVector;
192		2	unsigned int number = 0;
193
194	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
195		262142	*cPtr++ = (*aPtr++) + (*bPtr++);
196			}
197		2	}
198			#endif /* LV_HAVE_GENERIC */
199
200
201			#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
202			#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203			#define INCLUDED_volk_32f_x2_add_32f_a_H
204
205			#include <inttypes.h>
206			#include <stdio.h>
207
208			#ifdef LV_HAVE_AVX512F
209			#include <immintrin.h>
210
211		✗	static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
212			const float* aVector,
213			const float* bVector,
214			unsigned int num_points)
215			{
216		✗	unsigned int number = 0;
217		✗	const unsigned int sixteenthPoints = num_points / 16;
218
219		✗	float* cPtr = cVector;
220		✗	const float* aPtr = aVector;
221		✗	const float* bPtr = bVector;
222
223			__m512 aVal, bVal, cVal;
224		✗	for (; number < sixteenthPoints; number++) {
225
226		✗	aVal = _mm512_load_ps(aPtr);
227		✗	bVal = _mm512_load_ps(bPtr);
228
229		✗	cVal = _mm512_add_ps(aVal, bVal);
230
231			_mm512_store_ps(cPtr, cVal); // Store the results back into the C container
232
233		✗	aPtr += 16;
234		✗	bPtr += 16;
235		✗	cPtr += 16;
236			}
237
238		✗	number = sixteenthPoints * 16;
239
240		✗	for (; number < num_points; number++) {
241		✗	*cPtr++ = (*aPtr++) + (*bPtr++);
242			}
243		✗	}
244
245			#endif /* LV_HAVE_AVX512F */
246
247
248			#ifdef LV_HAVE_AVX
249			#include <immintrin.h>
250
251		2	static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
252			const float* aVector,
253			const float* bVector,
254			unsigned int num_points)
255			{
256		2	unsigned int number = 0;
257		2	const unsigned int eighthPoints = num_points / 8;
258
259		2	float* cPtr = cVector;
260		2	const float* aPtr = aVector;
261		2	const float* bPtr = bVector;
262
263			__m256 aVal, bVal, cVal;
264	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
265
266		32766	aVal = _mm256_load_ps(aPtr);
267		32766	bVal = _mm256_load_ps(bPtr);
268
269		32766	cVal = _mm256_add_ps(aVal, bVal);
270
271			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
272
273		32766	aPtr += 8;
274		32766	bPtr += 8;
275		32766	cPtr += 8;
276			}
277
278		2	number = eighthPoints * 8;
279	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
280		14	*cPtr++ = (*aPtr++) + (*bPtr++);
281			}
282		2	}
283			#endif /* LV_HAVE_AVX */
284
285			#ifdef LV_HAVE_SSE
286			#include <xmmintrin.h>
287
288		2	static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
289			const float* aVector,
290			const float* bVector,
291			unsigned int num_points)
292			{
293		2	unsigned int number = 0;
294		2	const unsigned int quarterPoints = num_points / 4;
295
296		2	float* cPtr = cVector;
297		2	const float* aPtr = aVector;
298		2	const float* bPtr = bVector;
299
300			__m128 aVal, bVal, cVal;
301	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
302		65534	aVal = _mm_load_ps(aPtr);
303		65534	bVal = _mm_load_ps(bPtr);
304
305		65534	cVal = _mm_add_ps(aVal, bVal);
306
307			_mm_store_ps(cPtr, cVal); // Store the results back into the C container
308
309		65534	aPtr += 4;
310		65534	bPtr += 4;
311		65534	cPtr += 4;
312			}
313
314		2	number = quarterPoints * 4;
315	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
316		6	*cPtr++ = (*aPtr++) + (*bPtr++);
317			}
318		2	}
319			#endif /* LV_HAVE_SSE */
320
321
322			#ifdef LV_HAVE_NEON
323			#include <arm_neon.h>
324
325			static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
326			const float* aVector,
327			const float* bVector,
328			unsigned int num_points)
329			{
330			unsigned int number = 0;
331			const unsigned int quarterPoints = num_points / 4;
332
333			float* cPtr = cVector;
334			const float* aPtr = aVector;
335			const float* bPtr = bVector;
336			float32x4_t aVal, bVal, cVal;
337			for (number = 0; number < quarterPoints; number++) {
338			// Load in to NEON registers
339			aVal = vld1q_f32(aPtr);
340			bVal = vld1q_f32(bPtr);
341			__VOLK_PREFETCH(aPtr + 4);
342			__VOLK_PREFETCH(bPtr + 4);
343
344			// vector add
345			cVal = vaddq_f32(aVal, bVal);
346			// Store the results back into the C container
347			vst1q_f32(cPtr, cVal);
348
349			aPtr += 4; // q uses quadwords, 4 floats per vadd
350			bPtr += 4;
351			cPtr += 4;
352			}
353
354			number = quarterPoints * 4; // should be = num_points
355			for (; number < num_points; number++) {
356			*cPtr++ = (*aPtr++) + (*bPtr++);
357			}
358			}
359
360			#endif /* LV_HAVE_NEON */
361
362			#ifdef LV_HAVE_NEONV7
363			extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
364			const float* aVector,
365			const float* bVector,
366			unsigned int num_points);
367			#endif /* LV_HAVE_NEONV7 */
368
369			#ifdef LV_HAVE_NEONV7
370			extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
371			const float* aVector,
372			const float* bVector,
373			unsigned int num_points);
374			#endif /* LV_HAVE_NEONV7 */
375
376			#ifdef LV_HAVE_GENERIC
377
378		2	static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
379			const float* aVector,
380			const float* bVector,
381			unsigned int num_points)
382			{
383		2	float* cPtr = cVector;
384		2	const float* aPtr = aVector;
385		2	const float* bPtr = bVector;
386		2	unsigned int number = 0;
387
388	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
389		262142	*cPtr++ = (*aPtr++) + (*bPtr++);
390			}
391		2	}
392			#endif /* LV_HAVE_GENERIC */
393
394
395			#ifdef LV_HAVE_ORC
396
397			extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
398			const float* aVector,
399			const float* bVector,
400			unsigned int num_points);
401
402		2	static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
403			const float* aVector,
404			const float* bVector,
405			unsigned int num_points)
406			{
407		2	volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
408		2	}
409
410			#endif /* LV_HAVE_ORC */
411
412
413			#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
414