GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_invsqrt_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 0 56 0.0%
Functions: 0 5 0.0%
Branches: 0 14 0.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_invsqrt_32f
12 *
13 * \b Overview
14 *
15 * Computes the inverse square root of the input vector and stores
16 * result in the output vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int
21 * num_points) \endcode
22 *
23 * \b Inputs
24 * \li aVector: the input vector of floats.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li cVector: The output vector.
29 *
30 * \b Example
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * for(unsigned int ii = 0; ii < N; ++ii){
38 * in[ii] = 1.0 / (float)(ii*ii);
39 * }
40 *
41 * volk_32f_invsqrt_32f(out, in, N);
42 *
43 * for(unsigned int ii = 0; ii < N; ++ii){
44 * printf("out(%i) = %f\n", ii, out[ii]);
45 * }
46 *
47 * volk_free(in);
48 * volk_free(out);
49 * \endcode
50 */
51
52 #ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53 #define INCLUDED_volk_32f_invsqrt_32f_a_H
54
55 #include <inttypes.h>
56 #include <math.h>
57 #include <stdio.h>
58 #include <string.h>
59
60 static inline float Q_rsqrt(float number)
61 {
62 float x2;
63 const float threehalfs = 1.5F;
64 union f32_to_i32 {
65 int32_t i;
66 float f;
67 } u;
68
69 x2 = number * 0.5F;
70 u.f = number;
71 u.i = 0x5f3759df - (u.i >> 1); // what the fuck?
72 u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
73 // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
74 // removed
75
76 return u.f;
77 }
78
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
81
82 static inline void
83 volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
84 {
85 unsigned int number = 0;
86 const unsigned int eighthPoints = num_points / 8;
87
88 float* cPtr = cVector;
89 const float* aPtr = aVector;
90 __m256 aVal, cVal;
91 for (; number < eighthPoints; number++) {
92 aVal = _mm256_load_ps(aPtr);
93 cVal = _mm256_rsqrt_ps(aVal);
94 _mm256_store_ps(cPtr, cVal);
95 aPtr += 8;
96 cPtr += 8;
97 }
98
99 number = eighthPoints * 8;
100 for (; number < num_points; number++)
101 *cPtr++ = Q_rsqrt(*aPtr++);
102 }
103 #endif /* LV_HAVE_AVX */
104
105
106 #ifdef LV_HAVE_SSE
107 #include <xmmintrin.h>
108
109 static inline void
110 volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
111 {
112 unsigned int number = 0;
113 const unsigned int quarterPoints = num_points / 4;
114
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
117
118 __m128 aVal, cVal;
119 for (; number < quarterPoints; number++) {
120
121 aVal = _mm_load_ps(aPtr);
122
123 cVal = _mm_rsqrt_ps(aVal);
124
125 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
126
127 aPtr += 4;
128 cPtr += 4;
129 }
130
131 number = quarterPoints * 4;
132 for (; number < num_points; number++) {
133 *cPtr++ = Q_rsqrt(*aPtr++);
134 }
135 }
136 #endif /* LV_HAVE_SSE */
137
138
139 #ifdef LV_HAVE_NEON
140 #include <arm_neon.h>
141
142 static inline void
143 volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
144 {
145 unsigned int number;
146 const unsigned int quarter_points = num_points / 4;
147
148 float* cPtr = cVector;
149 const float* aPtr = aVector;
150 float32x4_t a_val, c_val;
151 for (number = 0; number < quarter_points; ++number) {
152 a_val = vld1q_f32(aPtr);
153 c_val = vrsqrteq_f32(a_val);
154 vst1q_f32(cPtr, c_val);
155 aPtr += 4;
156 cPtr += 4;
157 }
158
159 for (number = quarter_points * 4; number < num_points; number++)
160 *cPtr++ = Q_rsqrt(*aPtr++);
161 }
162 #endif /* LV_HAVE_NEON */
163
164
165 #ifdef LV_HAVE_GENERIC
166
167 static inline void volk_32f_invsqrt_32f_generic(float* cVector,
168 const float* aVector,
169 unsigned int num_points)
170 {
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
174 for (number = 0; number < num_points; number++) {
175 *cPtr++ = Q_rsqrt(*aPtr++);
176 }
177 }
178 #endif /* LV_HAVE_GENERIC */
179
180 #ifdef LV_HAVE_AVX
181 #include <immintrin.h>
182
183 static inline void
184 volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
185 {
186 unsigned int number = 0;
187 const unsigned int eighthPoints = num_points / 8;
188
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
191 __m256 aVal, cVal;
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_loadu_ps(aPtr);
194 cVal = _mm256_rsqrt_ps(aVal);
195 _mm256_storeu_ps(cPtr, cVal);
196 aPtr += 8;
197 cPtr += 8;
198 }
199
200 number = eighthPoints * 8;
201 for (; number < num_points; number++)
202 *cPtr++ = Q_rsqrt(*aPtr++);
203 }
204 #endif /* LV_HAVE_AVX */
205
206 #endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
207