GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_s32f_add_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	68	68	100.0%
Functions:	6	6	100.0%
Branches:	10	10	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2020 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_s32f_add_32f
    
       *
    
       * \b Overview
    
       *
    
       * Adds a floating point scalar to a floating point vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_s32f_add_32f(float* cVector, const float* aVector, const float scalar,
    
       * unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The input vector of floats.
    
       * \li scalar: the scalar value to add against \p aVector.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector of floats.
    
       *
    
       * \b Example
    
       * \code
    
       *  int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
    
       *   }
    
       *
    
       *   // Add addshift to each entry.
    
       *   float addshift = 5.0f;
    
       *
    
       *   volk_32f_s32f_add_32f(out, increasing, addshift, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifndef INCLUDED_volk_32f_s32f_add_32f_u_H
    
      #define INCLUDED_volk_32f_s32f_add_32f_u_H
    
      #ifdef LV_HAVE_GENERIC
    
      10
      static inline void volk_32f_s32f_add_32f_generic(float* cVector,
    
                                                       const float* aVector,
    
                                                       const float scalar,
    
                                                       unsigned int num_points)
    
      {
    
      10
          unsigned int number = 0;
    
      10
          const float* inputPtr = aVector;
    
      10
          float* outputPtr = cVector;
    
        2/2✓ Branch 0 taken 262182 times.
✓ Branch 1 taken 10 times.

      262192
          for (number = 0; number < num_points; number++) {
    
      262182
              *outputPtr = (*inputPtr) + scalar;
    
      262182
              inputPtr++;
    
      262182
              outputPtr++;
    
          }
    
      10
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_add_32f_u_sse(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float scalar,
    
                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m128 aVal, bVal, cVal;
    
      2
          bVal = _mm_set_ps1(scalar);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_loadu_ps(aPtr);
    
      65534
              cVal = _mm_add_ps(aVal, bVal);
    
              _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_add_32f_u_avx(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float scalar,
    
                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, bVal, cVal;
    
      2
          bVal = _mm256_set1_ps(scalar);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              cVal = _mm256_add_ps(aVal, bVal);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32f_s32f_add_32f_u_neon(float* cVector,
    
                                                      const float* aVector,
    
                                                      const float scalar,
    
                                                      unsigned int num_points)
    
      {
    
          unsigned int number = 0;
    
          const float* inputPtr = aVector;
    
          float* outputPtr = cVector;
    
          const unsigned int quarterPoints = num_points / 4;
    
          float32x4_t aVal, cVal, scalarvec;
    
          scalarvec = vdupq_n_f32(scalar);
    
          for (number = 0; number < quarterPoints; number++) {
    
              aVal = vld1q_f32(inputPtr);        // Load into NEON regs
    
              cVal = vaddq_f32(aVal, scalarvec); // Do the add
    
              vst1q_f32(outputPtr, cVal);        // Store results back to output
    
              inputPtr += 4;
    
              outputPtr += 4;
    
          }
    
          number = quarterPoints * 4;
    
          volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points - number);
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */
    
      #ifndef INCLUDED_volk_32f_s32f_add_32f_a_H
    
      #define INCLUDED_volk_32f_s32f_add_32f_a_H
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_add_32f_a_sse(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float scalar,
    
                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m128 aVal, bVal, cVal;
    
      2
          bVal = _mm_set_ps1(scalar);
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              cVal = _mm_add_ps(aVal, bVal);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
      2
          volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_s32f_add_32f_a_avx(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float scalar,
    
                                                     unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, bVal, cVal;
    
      2
          bVal = _mm256_set1_ps(scalar);
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              cVal = _mm256_add_ps(aVal, bVal);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
      2
          volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst,
    
                                                   const float* src,
    
                                                   const float scalar,
    
                                                   unsigned int num_points);
    
      2
      static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float scalar,
    
                                                     unsigned int num_points)
    
      {
    
      2
          volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2020 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_s32f_add_32f
12			*
13			* \b Overview
14			*
15			* Adds a floating point scalar to a floating point vector.
16			*
17			* <b>Dispatcher Prototype</b>
18			* \code
19			* void volk_32f_s32f_add_32f(float* cVector, const float* aVector, const float scalar,
20			* unsigned int num_points) \endcode
21			*
22			* \b Inputs
23			* \li aVector: The input vector of floats.
24			* \li scalar: the scalar value to add against \p aVector.
25			* \li num_points: The number of data points.
26			*
27			* \b Outputs
28			* \li cVector: The output vector of floats.
29			*
30			* \b Example
31			* \code
32			* int N = 10;
33			* unsigned int alignment = volk_get_alignment();
34			* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
35			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36			*
37			* for(unsigned int ii = 0; ii < N; ++ii){
38			* increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
39			* }
40			*
41			* // Add addshift to each entry.
42			* float addshift = 5.0f;
43			*
44			* volk_32f_s32f_add_32f(out, increasing, addshift, N);
45			*
46			* for(unsigned int ii = 0; ii < N; ++ii){
47			* printf("out[%u] = %f\n", ii, out[ii]);
48			* }
49			*
50			* volk_free(increasing);
51			* volk_free(out);
52			* \endcode
53			*/
54
55			#include <inttypes.h>
56			#include <stdio.h>
57
58			#ifndef INCLUDED_volk_32f_s32f_add_32f_u_H
59			#define INCLUDED_volk_32f_s32f_add_32f_u_H
60
61			#ifdef LV_HAVE_GENERIC
62
63		10	static inline void volk_32f_s32f_add_32f_generic(float* cVector,
64			const float* aVector,
65			const float scalar,
66			unsigned int num_points)
67			{
68		10	unsigned int number = 0;
69		10	const float* inputPtr = aVector;
70		10	float* outputPtr = cVector;
71	2/2 ✓ Branch 0 taken 262182 times. ✓ Branch 1 taken 10 times.	262192	for (number = 0; number < num_points; number++) {
72		262182	*outputPtr = (*inputPtr) + scalar;
73		262182	inputPtr++;
74		262182	outputPtr++;
75			}
76		10	}
77
78			#endif /* LV_HAVE_GENERIC */
79			#ifdef LV_HAVE_SSE
80			#include <xmmintrin.h>
81
82		2	static inline void volk_32f_s32f_add_32f_u_sse(float* cVector,
83			const float* aVector,
84			const float scalar,
85			unsigned int num_points)
86			{
87		2	unsigned int number = 0;
88		2	const unsigned int quarterPoints = num_points / 4;
89
90		2	float* cPtr = cVector;
91		2	const float* aPtr = aVector;
92
93			__m128 aVal, bVal, cVal;
94		2	bVal = _mm_set_ps1(scalar);
95	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
96		65534	aVal = _mm_loadu_ps(aPtr);
97
98		65534	cVal = _mm_add_ps(aVal, bVal);
99
100			_mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
101
102		65534	aPtr += 4;
103		65534	cPtr += 4;
104			}
105
106		2	number = quarterPoints * 4;
107		2	volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
108		2	}
109			#endif /* LV_HAVE_SSE */
110
111			#ifdef LV_HAVE_AVX
112			#include <immintrin.h>
113
114		2	static inline void volk_32f_s32f_add_32f_u_avx(float* cVector,
115			const float* aVector,
116			const float scalar,
117			unsigned int num_points)
118			{
119		2	unsigned int number = 0;
120		2	const unsigned int eighthPoints = num_points / 8;
121
122		2	float* cPtr = cVector;
123		2	const float* aPtr = aVector;
124
125			__m256 aVal, bVal, cVal;
126		2	bVal = _mm256_set1_ps(scalar);
127	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
128
129		32766	aVal = _mm256_loadu_ps(aPtr);
130
131		32766	cVal = _mm256_add_ps(aVal, bVal);
132
133			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
134
135		32766	aPtr += 8;
136		32766	cPtr += 8;
137			}
138
139		2	number = eighthPoints * 8;
140		2	volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
141		2	}
142			#endif /* LV_HAVE_AVX */
143
144			#ifdef LV_HAVE_NEON
145			#include <arm_neon.h>
146
147			static inline void volk_32f_s32f_add_32f_u_neon(float* cVector,
148			const float* aVector,
149			const float scalar,
150			unsigned int num_points)
151			{
152			unsigned int number = 0;
153			const float* inputPtr = aVector;
154			float* outputPtr = cVector;
155			const unsigned int quarterPoints = num_points / 4;
156
157			float32x4_t aVal, cVal, scalarvec;
158
159			scalarvec = vdupq_n_f32(scalar);
160
161			for (number = 0; number < quarterPoints; number++) {
162			aVal = vld1q_f32(inputPtr); // Load into NEON regs
163			cVal = vaddq_f32(aVal, scalarvec); // Do the add
164			vst1q_f32(outputPtr, cVal); // Store results back to output
165			inputPtr += 4;
166			outputPtr += 4;
167			}
168
169			number = quarterPoints * 4;
170			volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points - number);
171			}
172			#endif /* LV_HAVE_NEON */
173
174
175			#endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */
176
177
178			#ifndef INCLUDED_volk_32f_s32f_add_32f_a_H
179			#define INCLUDED_volk_32f_s32f_add_32f_a_H
180
181			#ifdef LV_HAVE_SSE
182			#include <xmmintrin.h>
183
184		2	static inline void volk_32f_s32f_add_32f_a_sse(float* cVector,
185			const float* aVector,
186			const float scalar,
187			unsigned int num_points)
188			{
189		2	unsigned int number = 0;
190		2	const unsigned int quarterPoints = num_points / 4;
191
192		2	float* cPtr = cVector;
193		2	const float* aPtr = aVector;
194
195			__m128 aVal, bVal, cVal;
196		2	bVal = _mm_set_ps1(scalar);
197	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
198		65534	aVal = _mm_load_ps(aPtr);
199
200		65534	cVal = _mm_add_ps(aVal, bVal);
201
202			_mm_store_ps(cPtr, cVal); // Store the results back into the C container
203
204		65534	aPtr += 4;
205		65534	cPtr += 4;
206			}
207
208		2	number = quarterPoints * 4;
209		2	volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
210		2	}
211			#endif /* LV_HAVE_SSE */
212
213			#ifdef LV_HAVE_AVX
214			#include <immintrin.h>
215
216		2	static inline void volk_32f_s32f_add_32f_a_avx(float* cVector,
217			const float* aVector,
218			const float scalar,
219			unsigned int num_points)
220			{
221		2	unsigned int number = 0;
222		2	const unsigned int eighthPoints = num_points / 8;
223
224		2	float* cPtr = cVector;
225		2	const float* aPtr = aVector;
226
227			__m256 aVal, bVal, cVal;
228		2	bVal = _mm256_set1_ps(scalar);
229	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
230		32766	aVal = _mm256_load_ps(aPtr);
231
232		32766	cVal = _mm256_add_ps(aVal, bVal);
233
234			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
235
236		32766	aPtr += 8;
237		32766	cPtr += 8;
238			}
239
240		2	number = eighthPoints * 8;
241		2	volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
242		2	}
243			#endif /* LV_HAVE_AVX */
244
245			#ifdef LV_HAVE_ORC
246
247			extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst,
248			const float* src,
249			const float scalar,
250			unsigned int num_points);
251
252		2	static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
253			const float* aVector,
254			const float scalar,
255			unsigned int num_points)
256			{
257		2	volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points);
258		2	}
259			#endif /* LV_HAVE_ORC */
260
261			#endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */
262