Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_sqrt_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the square root of each element of the input vector and | ||
16 | * stores the results in the corresponding elements of the output vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points) | ||
21 | * \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: The input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li cVector: The output vector. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | int N = 10; | ||
33 | unsigned int alignment = volk_get_alignment(); | ||
34 | float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | |||
37 | for(unsigned int ii = 0; ii < N; ++ii){ | ||
38 | in[ii] = (float)(ii*ii); | ||
39 | } | ||
40 | |||
41 | volk_32f_sqrt_32f(out, in, N); | ||
42 | |||
43 | for(unsigned int ii = 0; ii < N; ++ii){ | ||
44 | printf("out(%u) = %f\n", ii, out[ii]); | ||
45 | } | ||
46 | |||
47 | volk_free(in); | ||
48 | volk_free(out); | ||
49 | * \endcode | ||
50 | */ | ||
51 | |||
52 | #ifndef INCLUDED_volk_32f_sqrt_32f_a_H | ||
53 | #define INCLUDED_volk_32f_sqrt_32f_a_H | ||
54 | |||
55 | #include <inttypes.h> | ||
56 | #include <math.h> | ||
57 | #include <stdio.h> | ||
58 | |||
59 | #ifdef LV_HAVE_SSE | ||
60 | #include <xmmintrin.h> | ||
61 | |||
62 | static inline void | ||
63 | 2 | volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) | |
64 | { | ||
65 | 2 | unsigned int number = 0; | |
66 | 2 | const unsigned int quarterPoints = num_points / 4; | |
67 | |||
68 | 2 | float* cPtr = cVector; | |
69 | 2 | const float* aPtr = aVector; | |
70 | |||
71 | __m128 aVal, cVal; | ||
72 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
73 | 65534 | aVal = _mm_load_ps(aPtr); | |
74 | |||
75 | 65534 | cVal = _mm_sqrt_ps(aVal); | |
76 | |||
77 | _mm_store_ps(cPtr, cVal); // Store the results back into the C container | ||
78 | |||
79 | 65534 | aPtr += 4; | |
80 | 65534 | cPtr += 4; | |
81 | } | ||
82 | |||
83 | 2 | number = quarterPoints * 4; | |
84 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (; number < num_points; number++) { |
85 | 6 | *cPtr++ = sqrtf(*aPtr++); | |
86 | } | ||
87 | 2 | } | |
88 | |||
89 | #endif /* LV_HAVE_SSE */ | ||
90 | |||
91 | #ifdef LV_HAVE_AVX | ||
92 | #include <immintrin.h> | ||
93 | |||
94 | static inline void | ||
95 | 2 | volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) | |
96 | { | ||
97 | 2 | unsigned int number = 0; | |
98 | 2 | const unsigned int eighthPoints = num_points / 8; | |
99 | |||
100 | 2 | float* cPtr = cVector; | |
101 | 2 | const float* aPtr = aVector; | |
102 | |||
103 | __m256 aVal, cVal; | ||
104 | 2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times. | 32768 | for (; number < eighthPoints; number++) { |
105 | 32766 | aVal = _mm256_load_ps(aPtr); | |
106 | |||
107 | 32766 | cVal = _mm256_sqrt_ps(aVal); | |
108 | |||
109 | _mm256_store_ps(cPtr, cVal); // Store the results back into the C container | ||
110 | |||
111 | 32766 | aPtr += 8; | |
112 | 32766 | cPtr += 8; | |
113 | } | ||
114 | |||
115 | 2 | number = eighthPoints * 8; | |
116 | 2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times. | 16 | for (; number < num_points; number++) { |
117 | 14 | *cPtr++ = sqrtf(*aPtr++); | |
118 | } | ||
119 | 2 | } | |
120 | |||
121 | #endif /* LV_HAVE_AVX */ | ||
122 | |||
123 | |||
124 | #ifdef LV_HAVE_NEON | ||
125 | #include <arm_neon.h> | ||
126 | |||
127 | static inline void | ||
128 | volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) | ||
129 | { | ||
130 | float* cPtr = cVector; | ||
131 | const float* aPtr = aVector; | ||
132 | unsigned int number = 0; | ||
133 | unsigned int quarter_points = num_points / 4; | ||
134 | float32x4_t in_vec, out_vec; | ||
135 | |||
136 | for (number = 0; number < quarter_points; number++) { | ||
137 | in_vec = vld1q_f32(aPtr); | ||
138 | // note: vrecpeq_f32(vrsqrteq_f32(x)) only yields a coarse estimate of sqrt(x); armv8 (AArch64) has vsqrtq_f32 which will be much better | ||
139 | out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec)); | ||
140 | vst1q_f32(cPtr, out_vec); | ||
141 | aPtr += 4; | ||
142 | cPtr += 4; | ||
143 | } | ||
144 | |||
145 | for (number = quarter_points * 4; number < num_points; number++) { | ||
146 | *cPtr++ = sqrtf(*aPtr++); | ||
147 | } | ||
148 | } | ||
149 | |||
150 | #endif /* LV_HAVE_NEON */ | ||
151 | |||
152 | |||
153 | #ifdef LV_HAVE_GENERIC | ||
154 | |||
155 | static inline void | ||
156 | 2 | volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) | |
157 | { | ||
158 | 2 | float* cPtr = cVector; | |
159 | 2 | const float* aPtr = aVector; | |
160 | 2 | unsigned int number = 0; | |
161 | |||
162 | 2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times. | 262144 | for (number = 0; number < num_points; number++) { |
163 | 262142 | *cPtr++ = sqrtf(*aPtr++); | |
164 | } | ||
165 | 2 | } | |
166 | |||
167 | #endif /* LV_HAVE_GENERIC */ | ||
168 | |||
169 | #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */ | ||
170 | |||
171 | #ifndef INCLUDED_volk_32f_sqrt_32f_u_H | ||
172 | #define INCLUDED_volk_32f_sqrt_32f_u_H | ||
173 | |||
174 | #include <inttypes.h> | ||
175 | #include <math.h> | ||
176 | #include <stdio.h> | ||
177 | #ifdef LV_HAVE_AVX | ||
178 | #include <immintrin.h> | ||
179 | |||
180 | static inline void | ||
181 | 2 | volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) | |
182 | { | ||
183 | 2 | unsigned int number = 0; | |
184 | 2 | const unsigned int eighthPoints = num_points / 8; | |
185 | |||
186 | 2 | float* cPtr = cVector; | |
187 | 2 | const float* aPtr = aVector; | |
188 | |||
189 | __m256 aVal, cVal; | ||
190 | 2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times. | 32768 | for (; number < eighthPoints; number++) { |
191 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
192 | |||
193 | 32766 | cVal = _mm256_sqrt_ps(aVal); | |
194 | |||
195 | _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container | ||
196 | |||
197 | 32766 | aPtr += 8; | |
198 | 32766 | cPtr += 8; | |
199 | } | ||
200 | |||
201 | 2 | number = eighthPoints * 8; | |
202 | 2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times. | 16 | for (; number < num_points; number++) { |
203 | 14 | *cPtr++ = sqrtf(*aPtr++); | |
204 | } | ||
205 | 2 | } | |
206 | |||
207 | #endif /* LV_HAVE_AVX */ | ||
208 | #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */ | ||
209 |