Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_s32f_stddev_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the standard deviation of the input buffer using the supplied mean. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float | ||
20 | * mean, unsigned int num_points) \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li inputBuffer: The input vector of floats. | ||
24 | * \li mean: The mean of the input buffer. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li stddev: Pointer to a single float that receives the computed standard deviation. | ||
29 | * | ||
30 | * \b Example | ||
31 | * Calculate the standard deviation from numbers generated with c++11's normal generator | ||
32 | * \code | ||
33 | * int N = 1000; | ||
34 | * unsigned int alignment = volk_get_alignment(); | ||
35 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * float mean = 0.0f; | ||
37 | * float* stddev = (float*)volk_malloc(sizeof(float), alignment); | ||
38 | * | ||
39 | * // Use a normal generator with 0 mean, stddev = 1 | ||
40 | * std::default_random_engine generator; | ||
41 | * std::normal_distribution<float> distribution(mean,1); | ||
42 | * | ||
43 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
44 | * increasing[ii] = distribution(generator); | ||
45 | * } | ||
46 | * | ||
47 | * volk_32f_s32f_stddev_32f(stddev, increasing, mean, N); | ||
48 | * | ||
49 | * printf("std. dev. = %f\n", *stddev); | ||
50 | * | ||
51 | * volk_free(increasing); volk_free(stddev); | ||
52 | * \endcode | ||
53 | */ | ||
54 | |||
55 | #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H | ||
56 | #define INCLUDED_volk_32f_s32f_stddev_32f_a_H | ||
57 | |||
58 | #include <inttypes.h> | ||
59 | #include <math.h> | ||
60 | #include <stdio.h> | ||
61 | #include <volk/volk_common.h> | ||
62 | |||
63 | #ifdef LV_HAVE_SSE4_1 | ||
64 | #include <smmintrin.h> | ||
65 | |||
66 | 2 | static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, | |
67 | const float* inputBuffer, | ||
68 | const float mean, | ||
69 | unsigned int num_points) | ||
70 | { | ||
71 | 2 | float returnValue = 0; | |
72 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
73 | 2 | unsigned int number = 0; | |
74 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
75 | |||
76 | 2 | const float* aPtr = inputBuffer; | |
77 | |||
78 | __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; | ||
79 | |||
80 | 2 | __m128 squareAccumulator = _mm_setzero_ps(); | |
81 | __m128 aVal1, aVal2, aVal3, aVal4; | ||
82 | __m128 cVal1, cVal2, cVal3, cVal4; | ||
83 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
84 | 16382 | aVal1 = _mm_load_ps(aPtr); | |
85 | 16382 | aPtr += 4; | |
86 | 16382 | cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); | |
87 | |||
88 | 16382 | aVal2 = _mm_load_ps(aPtr); | |
89 | 16382 | aPtr += 4; | |
90 | 16382 | cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); | |
91 | |||
92 | 16382 | aVal3 = _mm_load_ps(aPtr); | |
93 | 16382 | aPtr += 4; | |
94 | 16382 | cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); | |
95 | |||
96 | 16382 | aVal4 = _mm_load_ps(aPtr); | |
97 | 16382 | aPtr += 4; | |
98 | 16382 | cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); | |
99 | |||
100 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal2); | |
101 | 16382 | cVal3 = _mm_or_ps(cVal3, cVal4); | |
102 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal3); | |
103 | |||
104 | squareAccumulator = | ||
105 | 16382 | _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 | |
106 | } | ||
107 | _mm_store_ps(squareBuffer, | ||
108 | squareAccumulator); // Store the results back into the C container | ||
109 | 2 | returnValue = squareBuffer[0]; | |
110 | 2 | returnValue += squareBuffer[1]; | |
111 | 2 | returnValue += squareBuffer[2]; | |
112 | 2 | returnValue += squareBuffer[3]; | |
113 | |||
114 | 2 | number = sixteenthPoints * 16; | |
115 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
116 | 30 | returnValue += (*aPtr) * (*aPtr); | |
117 | 30 | aPtr++; | |
118 | } | ||
119 | 2 | returnValue /= num_points; | |
120 | 2 | returnValue -= (mean * mean); | |
121 | 2 | returnValue = sqrtf(returnValue); | |
122 | } | ||
123 | 2 | *stddev = returnValue; | |
124 | 2 | } | |
125 | |||
126 | #endif /* LV_HAVE_SSE4_1 */ | ||
127 | |||
128 | #ifdef LV_HAVE_SSE | ||
129 | #include <xmmintrin.h> | ||
130 | |||
131 | 2 | static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, | |
132 | const float* inputBuffer, | ||
133 | const float mean, | ||
134 | unsigned int num_points) | ||
135 | { | ||
136 | 2 | float returnValue = 0; | |
137 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
138 | 2 | unsigned int number = 0; | |
139 | 2 | const unsigned int quarterPoints = num_points / 4; | |
140 | |||
141 | 2 | const float* aPtr = inputBuffer; | |
142 | |||
143 | __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; | ||
144 | |||
145 | 2 | __m128 squareAccumulator = _mm_setzero_ps(); | |
146 | 2 | __m128 aVal = _mm_setzero_ps(); | |
147 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
148 | 65534 | aVal = _mm_load_ps(aPtr); // aVal = x | |
149 | 65534 | aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 | |
150 | 65534 | squareAccumulator = _mm_add_ps(squareAccumulator, aVal); | |
151 | 65534 | aPtr += 4; | |
152 | } | ||
153 | _mm_store_ps(squareBuffer, | ||
154 | squareAccumulator); // Store the results back into the C container | ||
155 | 2 | returnValue = squareBuffer[0]; | |
156 | 2 | returnValue += squareBuffer[1]; | |
157 | 2 | returnValue += squareBuffer[2]; | |
158 | 2 | returnValue += squareBuffer[3]; | |
159 | |||
160 | 2 | number = quarterPoints * 4; | |
161 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
162 | 6 | returnValue += (*aPtr) * (*aPtr); | |
163 | 6 | aPtr++; | |
164 | } | ||
165 | 2 | returnValue /= num_points; | |
166 | 2 | returnValue -= (mean * mean); | |
167 | 2 | returnValue = sqrtf(returnValue); | |
168 | } | ||
169 | 2 | *stddev = returnValue; | |
170 | 2 | } | |
171 | #endif /* LV_HAVE_SSE */ | ||
172 | |||
173 | |||
174 | #ifdef LV_HAVE_AVX | ||
175 | #include <immintrin.h> | ||
176 | |||
177 | 2 | static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev, | |
178 | const float* inputBuffer, | ||
179 | const float mean, | ||
180 | unsigned int num_points) | ||
181 | { | ||
182 | 2 | float stdDev = 0; | |
183 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
184 | 2 | unsigned int number = 0; | |
185 | 2 | const unsigned int thirtySecondthPoints = num_points / 32; | |
186 | |||
187 | 2 | const float* aPtr = inputBuffer; | |
188 | __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; | ||
189 | |||
190 | 2 | __m256 squareAccumulator = _mm256_setzero_ps(); | |
191 | __m256 aVal1, aVal2, aVal3, aVal4; | ||
192 | __m256 cVal1, cVal2, cVal3, cVal4; | ||
193 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtySecondthPoints; number++) { |
194 | 8190 | aVal1 = _mm256_load_ps(aPtr); | |
195 | 8190 | aPtr += 8; | |
196 | 8190 | cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); | |
197 | |||
198 | 8190 | aVal2 = _mm256_load_ps(aPtr); | |
199 | 8190 | aPtr += 8; | |
200 | 8190 | cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); | |
201 | |||
202 | 8190 | aVal3 = _mm256_load_ps(aPtr); | |
203 | 8190 | aPtr += 8; | |
204 | 8190 | cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); | |
205 | |||
206 | 8190 | aVal4 = _mm256_load_ps(aPtr); | |
207 | 8190 | aPtr += 8; | |
208 | 8190 | cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); | |
209 | |||
210 | 8190 | cVal1 = _mm256_or_ps(cVal1, cVal2); | |
211 | 8190 | cVal3 = _mm256_or_ps(cVal3, cVal4); | |
212 | 8190 | cVal1 = _mm256_or_ps(cVal1, cVal3); | |
213 | |||
214 | squareAccumulator = | ||
215 | 8190 | _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 | |
216 | } | ||
217 | _mm256_store_ps(squareBuffer, | ||
218 | squareAccumulator); // Store the results back into the C container | ||
219 | 2 | stdDev = squareBuffer[0]; | |
220 | 2 | stdDev += squareBuffer[1]; | |
221 | 2 | stdDev += squareBuffer[2]; | |
222 | 2 | stdDev += squareBuffer[3]; | |
223 | 2 | stdDev += squareBuffer[4]; | |
224 | 2 | stdDev += squareBuffer[5]; | |
225 | 2 | stdDev += squareBuffer[6]; | |
226 | 2 | stdDev += squareBuffer[7]; | |
227 | |||
228 | 2 | number = thirtySecondthPoints * 32; | |
229 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
230 | 62 | stdDev += (*aPtr) * (*aPtr); | |
231 | 62 | aPtr++; | |
232 | } | ||
233 | 2 | stdDev /= num_points; | |
234 | 2 | stdDev -= (mean * mean); | |
235 | 2 | stdDev = sqrtf(stdDev); | |
236 | } | ||
237 | 2 | *stddev = stdDev; | |
238 | 2 | } | |
239 | #endif /* LV_HAVE_AVX */ | ||
240 | |||
241 | |||
242 | #ifdef LV_HAVE_GENERIC | ||
243 | |||
244 | 2 | static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, | |
245 | const float* inputBuffer, | ||
246 | const float mean, | ||
247 | unsigned int num_points) | ||
248 | { | ||
249 | 2 | float returnValue = 0; | |
250 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
251 | 2 | const float* aPtr = inputBuffer; | |
252 | 2 | unsigned int number = 0; | |
253 | |||
254 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
255 | 262142 | returnValue += (*aPtr) * (*aPtr); | |
256 | 262142 | aPtr++; | |
257 | } | ||
258 | |||
259 | 2 | returnValue /= num_points; | |
260 | 2 | returnValue -= (mean * mean); | |
261 | 2 | returnValue = sqrtf(returnValue); | |
262 | } | ||
263 | 2 | *stddev = returnValue; | |
264 | 2 | } | |
265 | |||
266 | #endif /* LV_HAVE_GENERIC */ | ||
267 | |||
268 | |||
269 | #endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */ | ||
270 | |||
271 | #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H | ||
272 | #define INCLUDED_volk_32f_s32f_stddev_32f_u_H | ||
273 | |||
274 | #include <inttypes.h> | ||
275 | #include <math.h> | ||
276 | #include <stdio.h> | ||
277 | #include <volk/volk_common.h> | ||
278 | |||
279 | #ifdef LV_HAVE_AVX | ||
280 | #include <immintrin.h> | ||
281 | |||
282 | 2 | static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev, | |
283 | const float* inputBuffer, | ||
284 | const float mean, | ||
285 | unsigned int num_points) | ||
286 | { | ||
287 | 2 | float stdDev = 0; | |
288 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points > 0) { |
289 | 2 | unsigned int number = 0; | |
290 | 2 | const unsigned int thirtySecondthPoints = num_points / 32; | |
291 | |||
292 | 2 | const float* aPtr = inputBuffer; | |
293 | __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; | ||
294 | |||
295 | 2 | __m256 squareAccumulator = _mm256_setzero_ps(); | |
296 | __m256 aVal1, aVal2, aVal3, aVal4; | ||
297 | __m256 cVal1, cVal2, cVal3, cVal4; | ||
298 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtySecondthPoints; number++) { |
299 | 8190 | aVal1 = _mm256_loadu_ps(aPtr); | |
300 | 8190 | aPtr += 8; | |
301 | 8190 | cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); | |
302 | |||
303 | 8190 | aVal2 = _mm256_loadu_ps(aPtr); | |
304 | 8190 | aPtr += 8; | |
305 | 8190 | cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); | |
306 | |||
307 | 8190 | aVal3 = _mm256_loadu_ps(aPtr); | |
308 | 8190 | aPtr += 8; | |
309 | 8190 | cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); | |
310 | |||
311 | 8190 | aVal4 = _mm256_loadu_ps(aPtr); | |
312 | 8190 | aPtr += 8; | |
313 | 8190 | cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); | |
314 | |||
315 | 8190 | cVal1 = _mm256_or_ps(cVal1, cVal2); | |
316 | 8190 | cVal3 = _mm256_or_ps(cVal3, cVal4); | |
317 | 8190 | cVal1 = _mm256_or_ps(cVal1, cVal3); | |
318 | |||
319 | squareAccumulator = | ||
320 | 8190 | _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 | |
321 | } | ||
322 | _mm256_storeu_ps( | ||
323 | squareBuffer, | ||
324 | squareAccumulator); // Store the results back into the C container | ||
325 | 2 | stdDev = squareBuffer[0]; | |
326 | 2 | stdDev += squareBuffer[1]; | |
327 | 2 | stdDev += squareBuffer[2]; | |
328 | 2 | stdDev += squareBuffer[3]; | |
329 | 2 | stdDev += squareBuffer[4]; | |
330 | 2 | stdDev += squareBuffer[5]; | |
331 | 2 | stdDev += squareBuffer[6]; | |
332 | 2 | stdDev += squareBuffer[7]; | |
333 | |||
334 | 2 | number = thirtySecondthPoints * 32; | |
335 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
336 | 62 | stdDev += (*aPtr) * (*aPtr); | |
337 | 62 | aPtr++; | |
338 | } | ||
339 | 2 | stdDev /= num_points; | |
340 | 2 | stdDev -= (mean * mean); | |
341 | 2 | stdDev = sqrtf(stdDev); | |
342 | } | ||
343 | 2 | *stddev = stdDev; | |
344 | 2 | } | |
345 | #endif /* LV_HAVE_AVX */ | ||
346 | |||
347 | #endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */ | ||
348 |