GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_stddev_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 158 158 100.0%
Functions: 5 5 100.0%
Branches: 23 28 82.1%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_s32f_stddev_32f
12 *
13 * \b Overview
14 *
15 * Computes the standard deviation of the input buffer using the supplied mean.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float
20 * mean, unsigned int num_points) \endcode
21 *
22 * \b Inputs
23 * \li inputBuffer: The input vector of floats.
24 * \li mean: The mean of the input buffer.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li stddev: Pointer to a single float that receives the computed standard deviation.
29 *
30 * \b Example
31 * Calculate the standard deviation from numbers generated with c++11's normal generator
32 * \code
33 * int N = 1000;
34 * unsigned int alignment = volk_get_alignment();
35 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
36 * float mean = 0.0f;
37 * float* stddev = (float*)volk_malloc(sizeof(float), alignment);
38 *
39 * // Use a normal generator with 0 mean, stddev = 1
40 * std::default_random_engine generator;
41 * std::normal_distribution<float> distribution(mean,1);
42 *
43 * for(unsigned int ii = 0; ii < N; ++ii){
44 * increasing[ii] = distribution(generator);
45 * }
46 *
47 * volk_32f_s32f_stddev_32f(stddev, increasing, mean, N);
48 *
49 * printf("std. dev. = %f\n", *stddev);
50 *
51 * volk_free(increasing); volk_free(stddev);
52 * \endcode
53 */
54
55 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
57
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62
63 #ifdef LV_HAVE_SSE4_1
64 #include <smmintrin.h>
65
66 2 static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
67 const float* inputBuffer,
68 const float mean,
69 unsigned int num_points)
70 {
71 2 float returnValue = 0;
72
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points > 0) {
73 2 unsigned int number = 0;
74 2 const unsigned int sixteenthPoints = num_points / 16;
75
76 2 const float* aPtr = inputBuffer;
77
78 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
79
80 2 __m128 squareAccumulator = _mm_setzero_ps();
81 __m128 aVal1, aVal2, aVal3, aVal4;
82 __m128 cVal1, cVal2, cVal3, cVal4;
83
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
84 16382 aVal1 = _mm_load_ps(aPtr);
85 16382 aPtr += 4;
86 16382 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
87
88 16382 aVal2 = _mm_load_ps(aPtr);
89 16382 aPtr += 4;
90 16382 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
91
92 16382 aVal3 = _mm_load_ps(aPtr);
93 16382 aPtr += 4;
94 16382 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
95
96 16382 aVal4 = _mm_load_ps(aPtr);
97 16382 aPtr += 4;
98 16382 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
99
100 16382 cVal1 = _mm_or_ps(cVal1, cVal2);
101 16382 cVal3 = _mm_or_ps(cVal3, cVal4);
102 16382 cVal1 = _mm_or_ps(cVal1, cVal3);
103
104 squareAccumulator =
105 16382 _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
106 }
107 _mm_store_ps(squareBuffer,
108 squareAccumulator); // Store the results back into the C container
109 2 returnValue = squareBuffer[0];
110 2 returnValue += squareBuffer[1];
111 2 returnValue += squareBuffer[2];
112 2 returnValue += squareBuffer[3];
113
114 2 number = sixteenthPoints * 16;
115
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
116 30 returnValue += (*aPtr) * (*aPtr);
117 30 aPtr++;
118 }
119 2 returnValue /= num_points;
120 2 returnValue -= (mean * mean);
121 2 returnValue = sqrtf(returnValue);
122 }
123 2 *stddev = returnValue;
124 2 }
125
126 #endif /* LV_HAVE_SSE4_1 */
127
128 #ifdef LV_HAVE_SSE
129 #include <xmmintrin.h>
130
131 2 static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
132 const float* inputBuffer,
133 const float mean,
134 unsigned int num_points)
135 {
136 2 float returnValue = 0;
137
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points > 0) {
138 2 unsigned int number = 0;
139 2 const unsigned int quarterPoints = num_points / 4;
140
141 2 const float* aPtr = inputBuffer;
142
143 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
144
145 2 __m128 squareAccumulator = _mm_setzero_ps();
146 2 __m128 aVal = _mm_setzero_ps();
147
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
148 65534 aVal = _mm_load_ps(aPtr); // aVal = x
149 65534 aVal = _mm_mul_ps(aVal, aVal); // aVal = x^2
150 65534 squareAccumulator = _mm_add_ps(squareAccumulator, aVal); // squareAccumulator += x^2
151 65534 aPtr += 4;
152 }
153 _mm_store_ps(squareBuffer,
154 squareAccumulator); // Store the results back into the C container
155 2 returnValue = squareBuffer[0];
156 2 returnValue += squareBuffer[1];
157 2 returnValue += squareBuffer[2];
158 2 returnValue += squareBuffer[3];
159
160 2 number = quarterPoints * 4;
161
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
162 6 returnValue += (*aPtr) * (*aPtr);
163 6 aPtr++;
164 }
165 2 returnValue /= num_points;
166 2 returnValue -= (mean * mean);
167 2 returnValue = sqrtf(returnValue);
168 }
169 2 *stddev = returnValue;
170 2 }
171 #endif /* LV_HAVE_SSE */
172
173
174 #ifdef LV_HAVE_AVX
175 #include <immintrin.h>
176
177 2 static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
178 const float* inputBuffer,
179 const float mean,
180 unsigned int num_points)
181 {
182 2 float stdDev = 0;
183
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points > 0) {
184 2 unsigned int number = 0;
185 2 const unsigned int thirtySecondthPoints = num_points / 32;
186
187 2 const float* aPtr = inputBuffer;
188 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
189
190 2 __m256 squareAccumulator = _mm256_setzero_ps();
191 __m256 aVal1, aVal2, aVal3, aVal4;
192 __m256 cVal1, cVal2, cVal3, cVal4;
193
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtySecondthPoints; number++) {
194 8190 aVal1 = _mm256_load_ps(aPtr);
195 8190 aPtr += 8;
196 8190 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
197
198 8190 aVal2 = _mm256_load_ps(aPtr);
199 8190 aPtr += 8;
200 8190 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
201
202 8190 aVal3 = _mm256_load_ps(aPtr);
203 8190 aPtr += 8;
204 8190 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
205
206 8190 aVal4 = _mm256_load_ps(aPtr);
207 8190 aPtr += 8;
208 8190 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
209
210 8190 cVal1 = _mm256_or_ps(cVal1, cVal2);
211 8190 cVal3 = _mm256_or_ps(cVal3, cVal4);
212 8190 cVal1 = _mm256_or_ps(cVal1, cVal3);
213
214 squareAccumulator =
215 8190 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
216 }
217 _mm256_store_ps(squareBuffer,
218 squareAccumulator); // Store the results back into the C container
219 2 stdDev = squareBuffer[0];
220 2 stdDev += squareBuffer[1];
221 2 stdDev += squareBuffer[2];
222 2 stdDev += squareBuffer[3];
223 2 stdDev += squareBuffer[4];
224 2 stdDev += squareBuffer[5];
225 2 stdDev += squareBuffer[6];
226 2 stdDev += squareBuffer[7];
227
228 2 number = thirtySecondthPoints * 32;
229
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
230 62 stdDev += (*aPtr) * (*aPtr);
231 62 aPtr++;
232 }
233 2 stdDev /= num_points;
234 2 stdDev -= (mean * mean);
235 2 stdDev = sqrtf(stdDev);
236 }
237 2 *stddev = stdDev;
238 2 }
239 #endif /* LV_HAVE_AVX */
240
241
242 #ifdef LV_HAVE_GENERIC
243
244 2 static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
245 const float* inputBuffer,
246 const float mean,
247 unsigned int num_points)
248 {
249 2 float returnValue = 0;
250
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points > 0) {
251 2 const float* aPtr = inputBuffer;
252 2 unsigned int number = 0;
253
254
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
255 262142 returnValue += (*aPtr) * (*aPtr);
256 262142 aPtr++;
257 }
258
259 2 returnValue /= num_points;
260 2 returnValue -= (mean * mean);
261 2 returnValue = sqrtf(returnValue);
262 }
263 2 *stddev = returnValue;
264 2 }
265
266 #endif /* LV_HAVE_GENERIC */
267
268
269 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
270
271 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
272 #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
273
274 #include <inttypes.h>
275 #include <math.h>
276 #include <stdio.h>
277 #include <volk/volk_common.h>
278
279 #ifdef LV_HAVE_AVX
280 #include <immintrin.h>
281
282 2 static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
283 const float* inputBuffer,
284 const float mean,
285 unsigned int num_points)
286 {
287 2 float stdDev = 0;
288
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points > 0) {
289 2 unsigned int number = 0;
290 2 const unsigned int thirtySecondthPoints = num_points / 32;
291
292 2 const float* aPtr = inputBuffer;
293 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
294
295 2 __m256 squareAccumulator = _mm256_setzero_ps();
296 __m256 aVal1, aVal2, aVal3, aVal4;
297 __m256 cVal1, cVal2, cVal3, cVal4;
298
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtySecondthPoints; number++) {
299 8190 aVal1 = _mm256_loadu_ps(aPtr);
300 8190 aPtr += 8;
301 8190 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
302
303 8190 aVal2 = _mm256_loadu_ps(aPtr);
304 8190 aPtr += 8;
305 8190 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
306
307 8190 aVal3 = _mm256_loadu_ps(aPtr);
308 8190 aPtr += 8;
309 8190 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
310
311 8190 aVal4 = _mm256_loadu_ps(aPtr);
312 8190 aPtr += 8;
313 8190 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
314
315 8190 cVal1 = _mm256_or_ps(cVal1, cVal2);
316 8190 cVal3 = _mm256_or_ps(cVal3, cVal4);
317 8190 cVal1 = _mm256_or_ps(cVal1, cVal3);
318
319 squareAccumulator =
320 8190 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
321 }
322 _mm256_storeu_ps(
323 squareBuffer,
324 squareAccumulator); // Store the results back into the C container
325 2 stdDev = squareBuffer[0];
326 2 stdDev += squareBuffer[1];
327 2 stdDev += squareBuffer[2];
328 2 stdDev += squareBuffer[3];
329 2 stdDev += squareBuffer[4];
330 2 stdDev += squareBuffer[5];
331 2 stdDev += squareBuffer[6];
332 2 stdDev += squareBuffer[7];
333
334 2 number = thirtySecondthPoints * 32;
335
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
336 62 stdDev += (*aPtr) * (*aPtr);
337 62 aPtr++;
338 }
339 2 stdDev /= num_points;
340 2 stdDev -= (mean * mean);
341 2 stdDev = sqrtf(stdDev);
342 }
343 2 *stddev = stdDev;
344 2 }
345 #endif /* LV_HAVE_AVX */
346
347 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
348