GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_sqrt_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 49 49 100.0%
Functions: 4 4 100.0%
Branches: 14 14 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_sqrt_32f
12 *
13 * \b Overview
14 *
15 * Computes the square root of the input vector and stores the results
16 * in the output vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
21 * \endcode
22 *
23 * \b Inputs
24 * \li aVector: The input vector of floats.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li cVector: The output vector.
29 *
30 * \b Example
31 * \code
32 unsigned int N = 10;
33 unsigned int alignment = volk_get_alignment();
34 float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
35 float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36
37 for(unsigned int ii = 0; ii < N; ++ii){
38 in[ii] = (float)(ii*ii);
39 }
40
41 volk_32f_sqrt_32f(out, in, N);
42
43 for(unsigned int ii = 0; ii < N; ++ii){
44 printf("out(%u) = %f\n", ii, out[ii]);
45 }
46
47 volk_free(in);
48 volk_free(out);
49 * \endcode
50 */
51
52 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53 #define INCLUDED_volk_32f_sqrt_32f_a_H
54
55 #include <inttypes.h>
56 #include <math.h>
57 #include <stdio.h>
58
59 #ifdef LV_HAVE_SSE
60 #include <xmmintrin.h>
61
62 static inline void
63 2 volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64 {
65 2 unsigned int number = 0;
66 2 const unsigned int quarterPoints = num_points / 4;
67
68 2 float* cPtr = cVector;
69 2 const float* aPtr = aVector;
70
71 __m128 aVal, cVal;
72
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
73 65534 aVal = _mm_load_ps(aPtr);
74
75 65534 cVal = _mm_sqrt_ps(aVal);
76
77 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
78
79 65534 aPtr += 4;
80 65534 cPtr += 4;
81 }
82
83 2 number = quarterPoints * 4;
84
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
85 6 *cPtr++ = sqrtf(*aPtr++);
86 }
87 2 }
88
89 #endif /* LV_HAVE_SSE */
90
91 #ifdef LV_HAVE_AVX
92 #include <immintrin.h>
93
94 static inline void
95 2 volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
96 {
97 2 unsigned int number = 0;
98 2 const unsigned int eighthPoints = num_points / 8;
99
100 2 float* cPtr = cVector;
101 2 const float* aPtr = aVector;
102
103 __m256 aVal, cVal;
104
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
105 32766 aVal = _mm256_load_ps(aPtr);
106
107 32766 cVal = _mm256_sqrt_ps(aVal);
108
109 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
110
111 32766 aPtr += 8;
112 32766 cPtr += 8;
113 }
114
115 2 number = eighthPoints * 8;
116
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
117 14 *cPtr++ = sqrtf(*aPtr++);
118 }
119 2 }
120
121 #endif /* LV_HAVE_AVX */
122
123
124 #ifdef LV_HAVE_NEON
125 #include <arm_neon.h>
126
127 static inline void
128 volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
129 {
130 float* cPtr = cVector;
131 const float* aPtr = aVector;
132 unsigned int number = 0;
133 unsigned int quarter_points = num_points / 4;
134 float32x4_t in_vec, out_vec;
135
136 for (number = 0; number < quarter_points; number++) {
137 in_vec = vld1q_f32(aPtr);
138 // note: this is only an estimate (reciprocal estimate of the reciprocal-sqrt estimate); armv8 (AArch64) has vsqrtq_f32, which will be much better
139 out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
140 vst1q_f32(cPtr, out_vec);
141 aPtr += 4;
142 cPtr += 4;
143 }
144
145 for (number = quarter_points * 4; number < num_points; number++) {
146 *cPtr++ = sqrtf(*aPtr++);
147 }
148 }
149
150 #endif /* LV_HAVE_NEON */
151
152
153 #ifdef LV_HAVE_GENERIC
154
155 static inline void
156 2 volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157 {
158 2 float* cPtr = cVector;
159 2 const float* aPtr = aVector;
160 2 unsigned int number = 0;
161
162
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
163 262142 *cPtr++ = sqrtf(*aPtr++);
164 }
165 2 }
166
167 #endif /* LV_HAVE_GENERIC */
168
169 #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
170
171 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
172 #define INCLUDED_volk_32f_sqrt_32f_u_H
173
174 #include <inttypes.h>
175 #include <math.h>
176 #include <stdio.h>
177 #ifdef LV_HAVE_AVX
178 #include <immintrin.h>
179
180 static inline void
181 2 volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
182 {
183 2 unsigned int number = 0;
184 2 const unsigned int eighthPoints = num_points / 8;
185
186 2 float* cPtr = cVector;
187 2 const float* aPtr = aVector;
188
189 __m256 aVal, cVal;
190
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
191 32766 aVal = _mm256_loadu_ps(aPtr);
192
193 32766 cVal = _mm256_sqrt_ps(aVal);
194
195 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
196
197 32766 aPtr += 8;
198 32766 cPtr += 8;
199 }
200
201 2 number = eighthPoints * 8;
202
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
203 14 *cPtr++ = sqrtf(*aPtr++);
204 }
205 2 }
206
207 #endif /* LV_HAVE_AVX */
208 #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
209