GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_invsqrt_32f.h
Date:	2023-10-23 23:10:04

	Total	Coverage
Lines:	56	0.0%
Functions:	5	0.0%
Branches:	14	0.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2013, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_invsqrt_32f
    
       *
    
       * \b Overview
    
       *
    
       * Computes the inverse square root of the input vector and stores
    
       * result in the output vector.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int
    
       * num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: the input vector of floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       in[ii] = 1.0 / (float)(ii*ii);
    
       *   }
    
       *
    
       *   volk_32f_invsqrt_32f(out, in, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out(%i) = %f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
    
      #define INCLUDED_volk_32f_invsqrt_32f_a_H
    
      #include <inttypes.h>
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <string.h>
    
      ✗
      static inline float Q_rsqrt(float number)
    
      {
    
          float x2;
    
      ✗
          const float threehalfs = 1.5F;
    
          union f32_to_i32 {
    
              int32_t i;
    
              float f;
    
          } u;
    
      ✗
          x2 = number * 0.5F;
    
      ✗
          u.f = number;
    
      ✗
          u.i = 0x5f3759df - (u.i >> 1);               // what the fuck?
    
      ✗
          u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
    
          // u.f  = u.f * ( threehalfs - ( x2 * u.f * u.f ) );   // 2nd iteration, this can be
    
          // removed
    
      ✗
          return u.f;
    
      }
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      ✗
      volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int eighthPoints = num_points / 8;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
          __m256 aVal, cVal;
    
      ✗
          for (; number < eighthPoints; number++) {
    
      ✗
              aVal = _mm256_load_ps(aPtr);
    
      ✗
              cVal = _mm256_rsqrt_ps(aVal);
    
              _mm256_store_ps(cPtr, cVal);
    
      ✗
              aPtr += 8;
    
      ✗
              cPtr += 8;
    
          }
    
      ✗
          number = eighthPoints * 8;
    
      ✗
          for (; number < num_points; number++)
    
      ✗
              *cPtr++ = Q_rsqrt(*aPtr++);
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      static inline void
    
      ✗
      volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int quarterPoints = num_points / 4;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
          __m128 aVal, cVal;
    
      ✗
          for (; number < quarterPoints; number++) {
    
      ✗
              aVal = _mm_load_ps(aPtr);
    
      ✗
              cVal = _mm_rsqrt_ps(aVal);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      ✗
              aPtr += 4;
    
      ✗
              cPtr += 4;
    
          }
    
      ✗
          number = quarterPoints * 4;
    
      ✗
          for (; number < num_points; number++) {
    
      ✗
              *cPtr++ = Q_rsqrt(*aPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void
    
      volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
          unsigned int number;
    
          const unsigned int quarter_points = num_points / 4;
    
          float* cPtr = cVector;
    
          const float* aPtr = aVector;
    
          float32x4_t a_val, c_val;
    
          for (number = 0; number < quarter_points; ++number) {
    
              a_val = vld1q_f32(aPtr);
    
              c_val = vrsqrteq_f32(a_val);
    
              vst1q_f32(cPtr, c_val);
    
              aPtr += 4;
    
              cPtr += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++)
    
              *cPtr++ = Q_rsqrt(*aPtr++);
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_GENERIC
    
      ✗
      static inline void volk_32f_invsqrt_32f_generic(float* cVector,
    
                                                      const float* aVector,
    
                                                      unsigned int num_points)
    
      {
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
      ✗
          unsigned int number = 0;
    
      ✗
          for (number = 0; number < num_points; number++) {
    
      ✗
              *cPtr++ = Q_rsqrt(*aPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      static inline void
    
      ✗
      volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int eighthPoints = num_points / 8;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
          __m256 aVal, cVal;
    
      ✗
          for (; number < eighthPoints; number++) {
    
      ✗
              aVal = _mm256_loadu_ps(aPtr);
    
      ✗
              cVal = _mm256_rsqrt_ps(aVal);
    
              _mm256_storeu_ps(cPtr, cVal);
    
      ✗
              aPtr += 8;
    
      ✗
              cPtr += 8;
    
          }
    
      ✗
          number = eighthPoints * 8;
    
      ✗
          for (; number < num_points; number++)
    
      ✗
              *cPtr++ = Q_rsqrt(*aPtr++);
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */

Line	Exec	Source
1		/* -- c++ -- */
2		/*
3		* Copyright 2013, 2014 Free Software Foundation, Inc.
4		*
5		* This file is part of VOLK
6		*
7		* SPDX-License-Identifier: LGPL-3.0-or-later
8		*/
9
10		/*!
11		* \page volk_32f_invsqrt_32f
12		*
13		* \b Overview
14		*
15		* Computes the inverse square root of the input vector and stores
16		* result in the output vector.
17		*
18		* <b>Dispatcher Prototype</b>
19		* \code
20		* void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int
21		* num_points) \endcode
22		*
23		* \b Inputs
24		* \li aVector: the input vector of floats.
25		* \li num_points: The number of data points.
26		*
27		* \b Outputs
28		* \li cVector: The output vector.
29		*
30		* \b Example
31		* \code
32		* int N = 10;
33		* unsigned int alignment = volk_get_alignment();
34		* float* in = (float)volk_malloc(sizeof(float)N, alignment);
35		* float* out = (float)volk_malloc(sizeof(float)N, alignment);
36		*
37		* for(unsigned int ii = 0; ii < N; ++ii){
38		* in[ii] = 1.0 / (float)(ii*ii);
39		* }
40		*
41		* volk_32f_invsqrt_32f(out, in, N);
42		*
43		* for(unsigned int ii = 0; ii < N; ++ii){
44		* printf("out(%i) = %f\n", ii, out[ii]);
45		* }
46		*
47		* volk_free(in);
48		* volk_free(out);
49		* \endcode
50		*/
51
52		#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53		#define INCLUDED_volk_32f_invsqrt_32f_a_H
54
55		#include <inttypes.h>
56		#include <math.h>
57		#include <stdio.h>
58		#include <string.h>
59
60	✗	static inline float Q_rsqrt(float number)
61		{
62		float x2;
63	✗	const float threehalfs = 1.5F;
64		union f32_to_i32 {
65		int32_t i;
66		float f;
67		} u;
68
69	✗	x2 = number * 0.5F;
70	✗	u.f = number;
71	✗	u.i = 0x5f3759df - (u.i >> 1); // what the fuck?
72	✗	u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
73		// u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
74		// removed
75
76	✗	return u.f;
77		}
78
79		#ifdef LV_HAVE_AVX
80		#include <immintrin.h>
81
82		static inline void
83	✗	volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
84		{
85	✗	unsigned int number = 0;
86	✗	const unsigned int eighthPoints = num_points / 8;
87
88	✗	float* cPtr = cVector;
89	✗	const float* aPtr = aVector;
90		__m256 aVal, cVal;
91	✗	for (; number < eighthPoints; number++) {
92	✗	aVal = _mm256_load_ps(aPtr);
93	✗	cVal = _mm256_rsqrt_ps(aVal);
94		_mm256_store_ps(cPtr, cVal);
95	✗	aPtr += 8;
96	✗	cPtr += 8;
97		}
98
99	✗	number = eighthPoints * 8;
100	✗	for (; number < num_points; number++)
101	✗	cPtr++ = Q_rsqrt(aPtr++);
102	✗	}
103		#endif /* LV_HAVE_AVX */
104
105
106		#ifdef LV_HAVE_SSE
107		#include <xmmintrin.h>
108
109		static inline void
110	✗	volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
111		{
112	✗	unsigned int number = 0;
113	✗	const unsigned int quarterPoints = num_points / 4;
114
115	✗	float* cPtr = cVector;
116	✗	const float* aPtr = aVector;
117
118		__m128 aVal, cVal;
119	✗	for (; number < quarterPoints; number++) {
120
121	✗	aVal = _mm_load_ps(aPtr);
122
123	✗	cVal = _mm_rsqrt_ps(aVal);
124
125		_mm_store_ps(cPtr, cVal); // Store the results back into the C container
126
127	✗	aPtr += 4;
128	✗	cPtr += 4;
129		}
130
131	✗	number = quarterPoints * 4;
132	✗	for (; number < num_points; number++) {
133	✗	cPtr++ = Q_rsqrt(aPtr++);
134		}
135	✗	}
136		#endif /* LV_HAVE_SSE */
137
138
139		#ifdef LV_HAVE_NEON
140		#include <arm_neon.h>
141
142		static inline void
143		volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
144		{
145		unsigned int number;
146		const unsigned int quarter_points = num_points / 4;
147
148		float* cPtr = cVector;
149		const float* aPtr = aVector;
150		float32x4_t a_val, c_val;
151		for (number = 0; number < quarter_points; ++number) {
152		a_val = vld1q_f32(aPtr);
153		c_val = vrsqrteq_f32(a_val);
154		vst1q_f32(cPtr, c_val);
155		aPtr += 4;
156		cPtr += 4;
157		}
158
159		for (number = quarter_points * 4; number < num_points; number++)
160		cPtr++ = Q_rsqrt(aPtr++);
161		}
162		#endif /* LV_HAVE_NEON */
163
164
165		#ifdef LV_HAVE_GENERIC
166
167	✗	static inline void volk_32f_invsqrt_32f_generic(float* cVector,
168		const float* aVector,
169		unsigned int num_points)
170		{
171	✗	float* cPtr = cVector;
172	✗	const float* aPtr = aVector;
173	✗	unsigned int number = 0;
174	✗	for (number = 0; number < num_points; number++) {
175	✗	cPtr++ = Q_rsqrt(aPtr++);
176		}
177	✗	}
178		#endif /* LV_HAVE_GENERIC */
179
180		#ifdef LV_HAVE_AVX
181		#include <immintrin.h>
182
183		static inline void
184	✗	volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
185		{
186	✗	unsigned int number = 0;
187	✗	const unsigned int eighthPoints = num_points / 8;
188
189	✗	float* cPtr = cVector;
190	✗	const float* aPtr = aVector;
191		__m256 aVal, cVal;
192	✗	for (; number < eighthPoints; number++) {
193	✗	aVal = _mm256_loadu_ps(aPtr);
194	✗	cVal = _mm256_rsqrt_ps(aVal);
195		_mm256_storeu_ps(cPtr, cVal);
196	✗	aPtr += 8;
197	✗	cPtr += 8;
198		}
199
200	✗	number = eighthPoints * 8;
201	✗	for (; number < num_points; number++)
202	✗	cPtr++ = Q_rsqrt(aPtr++);
203	✗	}
204		#endif /* LV_HAVE_AVX */
205
206		#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
207