GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_sqrt_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	49	49	100.0%
Functions:	4	4	100.0%
Branches:	14	14	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_sqrt_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes the square root of the input vector and stores the results
    
       * in the output vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
    
       * \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       * \code
    
          int N = 10;
    
          unsigned int alignment = volk_get_alignment();
    
          float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
          float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
          for(unsigned int ii = 0; ii < N; ++ii){
    
              in[ii] = (float)(ii*ii);
    
          }
    
          volk_32f_sqrt_32f(out, in, N);
    
          for(unsigned int ii = 0; ii < N; ++ii){
    
              printf("out(%i) = %f\n", ii, out[ii]);
    
          }
    
          volk_free(in);
    
          volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
    
      #define INCLUDED_volk_32f_sqrt_32f_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      static inline void
    
      2
      volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m128 aVal, cVal;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              cVal = _mm_sqrt_ps(aVal);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *cPtr++ = sqrtf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              cVal = _mm256_sqrt_ps(aVal);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = sqrtf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void
    
      volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
          float* cPtr = cVector;
    
          const float* aPtr = aVector;
    
          unsigned int number = 0;
    
          unsigned int quarter_points = num_points / 4;
    
          float32x4_t in_vec, out_vec;
    
          for (number = 0; number < quarter_points; number++) {
    
              in_vec = vld1q_f32(aPtr);
    
              // note that armv8 has vsqrt_f32 which will be much better
    
              out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
    
              vst1q_f32(cPtr, out_vec);
    
              aPtr += 4;
    
              cPtr += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *cPtr++ = sqrtf(*aPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_GENERIC
    
      static inline void
    
      2
      volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = sqrtf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
    
      #define INCLUDED_volk_32f_sqrt_32f_u_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      2
      volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
          __m256 aVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              cVal = _mm256_sqrt_ps(aVal);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = sqrtf(*aPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_sqrt_32f
12			*
13			* \b Overview
14			*
15			* Computes the square root of the input vector and stores the results
16			* in the output vector.
17			*
18			* <b>Dispatcher Prototype</b>
19			* \code
20			* void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
21			* \endcode
22			*
23			* \b Inputs
24			* \li aVector: The input vector of floats.
25			* \li num_points: The number of data points.
26			*
27			* \b Outputs
28			* \li cVector: The output vector.
29			*
30			* \b Example
31			* \code
32			int N = 10;
33			unsigned int alignment = volk_get_alignment();
34			float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35			float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36
37			for(unsigned int ii = 0; ii < N; ++ii){
38			in[ii] = (float)(ii*ii);
39			}
40
41			volk_32f_sqrt_32f(out, in, N);
42
43			for(unsigned int ii = 0; ii < N; ++ii){
44			printf("out(%i) = %f\n", ii, out[ii]);
45			}
46
47			volk_free(in);
48			volk_free(out);
49			* \endcode
50			*/
51
52			#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53			#define INCLUDED_volk_32f_sqrt_32f_a_H
54
55			#include <inttypes.h>
56			#include <math.h>
57			#include <stdio.h>
58
59			#ifdef LV_HAVE_SSE
60			#include <xmmintrin.h>
61
62			static inline void
63		2	volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64			{
65		2	unsigned int number = 0;
66		2	const unsigned int quarterPoints = num_points / 4;
67
68		2	float* cPtr = cVector;
69		2	const float* aPtr = aVector;
70
71			__m128 aVal, cVal;
72	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
73		65534	aVal = _mm_load_ps(aPtr);
74
75		65534	cVal = _mm_sqrt_ps(aVal);
76
77			_mm_store_ps(cPtr, cVal); // Store the results back into the C container
78
79		65534	aPtr += 4;
80		65534	cPtr += 4;
81			}
82
83		2	number = quarterPoints * 4;
84	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
85		6	*cPtr++ = sqrtf(*aPtr++);
86			}
87		2	}
88
89			#endif /* LV_HAVE_SSE */
90
91			#ifdef LV_HAVE_AVX
92			#include <immintrin.h>
93
94			static inline void
95		2	volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
96			{
97		2	unsigned int number = 0;
98		2	const unsigned int eighthPoints = num_points / 8;
99
100		2	float* cPtr = cVector;
101		2	const float* aPtr = aVector;
102
103			__m256 aVal, cVal;
104	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
105		32766	aVal = _mm256_load_ps(aPtr);
106
107		32766	cVal = _mm256_sqrt_ps(aVal);
108
109			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
110
111		32766	aPtr += 8;
112		32766	cPtr += 8;
113			}
114
115		2	number = eighthPoints * 8;
116	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
117		14	*cPtr++ = sqrtf(*aPtr++);
118			}
119		2	}
120
121			#endif /* LV_HAVE_AVX */
122
123
124			#ifdef LV_HAVE_NEON
125			#include <arm_neon.h>
126
127			static inline void
128			volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
129			{
130			float* cPtr = cVector;
131			const float* aPtr = aVector;
132			unsigned int number = 0;
133			unsigned int quarter_points = num_points / 4;
134			float32x4_t in_vec, out_vec;
135
136			for (number = 0; number < quarter_points; number++) {
137			in_vec = vld1q_f32(aPtr);
138			// note that armv8 has vsqrt_f32 which will be much better
139			out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
140			vst1q_f32(cPtr, out_vec);
141			aPtr += 4;
142			cPtr += 4;
143			}
144
145			for (number = quarter_points * 4; number < num_points; number++) {
146			*cPtr++ = sqrtf(*aPtr++);
147			}
148			}
149
150			#endif /* LV_HAVE_NEON */
151
152
153			#ifdef LV_HAVE_GENERIC
154
155			static inline void
156		2	volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157			{
158		2	float* cPtr = cVector;
159		2	const float* aPtr = aVector;
160		2	unsigned int number = 0;
161
162	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
163		262142	*cPtr++ = sqrtf(*aPtr++);
164			}
165		2	}
166
167			#endif /* LV_HAVE_GENERIC */
168
169			#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
170
171			#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
172			#define INCLUDED_volk_32f_sqrt_32f_u_H
173
174			#include <inttypes.h>
175			#include <math.h>
176			#include <stdio.h>
177			#ifdef LV_HAVE_AVX
178			#include <immintrin.h>
179
180			static inline void
181		2	volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
182			{
183		2	unsigned int number = 0;
184		2	const unsigned int eighthPoints = num_points / 8;
185
186		2	float* cPtr = cVector;
187		2	const float* aPtr = aVector;
188
189			__m256 aVal, cVal;
190	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
191		32766	aVal = _mm256_loadu_ps(aPtr);
192
193		32766	cVal = _mm256_sqrt_ps(aVal);
194
195			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
196
197		32766	aPtr += 8;
198		32766	cPtr += 8;
199			}
200
201		2	number = eighthPoints * 8;
202	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
203		14	*cPtr++ = sqrtf(*aPtr++);
204			}
205		2	}
206
207			#endif /* LV_HAVE_AVX */
208			#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
209