GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 163 168 97.0%
Functions: 7 7 100.0%
Branches: 15 18 83.3%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2017 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_s32f_s32f_mod_range_32f
12 *
13 * \b wraps floating point numbers to stay within a defined [min,max] range
14 *
15 * <b>Dispatcher Prototype</b>
16 * \code
17 * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
18 * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
19 *
20 * \b Inputs
21 * \li inputVector: The input vector
22 * \li lower_bound: The lower output boundary
23 * \li upper_bound: The upper output boundary
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li outputVector: The vector where the results will be stored.
28 *
29 *
30 */
31
32 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
34
35 #ifdef LV_HAVE_GENERIC
36
37 14 static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
38 const float* inputVector,
39 const float lower_bound,
40 const float upper_bound,
41 unsigned int num_points)
42 {
43 14 float* outPtr = outputVector;
44 const float* inPtr;
45 14 const float distance = upper_bound - lower_bound;
46
47
2/2
✓ Branch 0 taken 262194 times.
✓ Branch 1 taken 14 times.
262208 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
48 262194 float val = *inPtr;
49
1/2
✓ Branch 0 taken 262194 times.
✗ Branch 1 not taken.
262194 if (val < lower_bound) {
50 262194 float excess = lower_bound - val;
51 262194 signed int count = (int)(excess / distance);
52 262194 *outPtr = val + (count + 1) * distance;
53 } else if (val > upper_bound) {
54 float excess = val - upper_bound;
55 signed int count = (int)(excess / distance);
56 *outPtr = val - (count + 1) * distance;
57 } else
58 *outPtr = val;
59 262194 outPtr++;
60 }
61 14 }
62 #endif /* LV_HAVE_GENERIC */
63
64
65 #ifdef LV_HAVE_AVX
66 #include <xmmintrin.h>
67
68 2 static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
69 const float* inputVector,
70 const float lower_bound,
71 const float upper_bound,
72 unsigned int num_points)
73 {
74 2 const __m256 lower = _mm256_set1_ps(lower_bound);
75 2 const __m256 upper = _mm256_set1_ps(upper_bound);
76 2 const __m256 distance = _mm256_sub_ps(upper, lower);
77 __m256 input, output;
78 __m256 is_smaller, is_bigger;
79 __m256 excess, adj;
80
81 2 const float* inPtr = inputVector;
82 2 float* outPtr = outputVector;
83 2 const size_t eight_points = num_points / 8;
84
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (size_t counter = 0; counter < eight_points; counter++) {
85 32766 input = _mm256_loadu_ps(inPtr);
86 // calculate mask: input < lower, input > upper
87 32766 is_smaller = _mm256_cmp_ps(
88 input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
89 32766 is_bigger = _mm256_cmp_ps(
90 input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
91 // find out how far we are out-of-bound – positive values!
92 65532 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
93 excess =
94 98298 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
95 // how many do we have to add? (int(excess/distance+1)*distance)
96 32766 excess = _mm256_div_ps(excess, distance);
97 // round down
98 65532 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
99 // plus 1
100 32766 adj = _mm256_set1_ps(1.0f);
101 32766 excess = _mm256_add_ps(excess, adj);
102 // get the sign right, adj still holds 1.0f in all eight lanes
103 32766 adj = _mm256_and_ps(adj, is_smaller);
104 98298 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
105 // scale by distance, sign
106 65532 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107 32766 output = _mm256_add_ps(input, excess);
108 _mm256_storeu_ps(outPtr, output);
109 32766 inPtr += 8;
110 32766 outPtr += 8;
111 }
112
113 2 volk_32f_s32f_s32f_mod_range_32f_generic(
114 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
115 2 }
116 2 static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
117 const float* inputVector,
118 const float lower_bound,
119 const float upper_bound,
120 unsigned int num_points)
121 {
122 2 const __m256 lower = _mm256_set1_ps(lower_bound);
123 2 const __m256 upper = _mm256_set1_ps(upper_bound);
124 2 const __m256 distance = _mm256_sub_ps(upper, lower);
125 __m256 input, output;
126 __m256 is_smaller, is_bigger;
127 __m256 excess, adj;
128
129 2 const float* inPtr = inputVector;
130 2 float* outPtr = outputVector;
131 2 const size_t eight_points = num_points / 8;
132
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (size_t counter = 0; counter < eight_points; counter++) {
133 32766 input = _mm256_load_ps(inPtr);
134 // calculate mask: input < lower, input > upper
135 32766 is_smaller = _mm256_cmp_ps(
136 input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
137 32766 is_bigger = _mm256_cmp_ps(
138 input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
139 // find out how far we are out-of-bound – positive values!
140 65532 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
141 excess =
142 98298 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
143 // how many do we have to add? (int(excess/distance+1)*distance)
144 32766 excess = _mm256_div_ps(excess, distance);
145 // round down
146 65532 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
147 // plus 1
148 32766 adj = _mm256_set1_ps(1.0f);
149 32766 excess = _mm256_add_ps(excess, adj);
150 // get the sign right, adj still holds 1.0f in all eight lanes
151 32766 adj = _mm256_and_ps(adj, is_smaller);
152 98298 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
153 // scale by distance, sign
154 65532 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155 32766 output = _mm256_add_ps(input, excess);
156 _mm256_store_ps(outPtr, output);
157 32766 inPtr += 8;
158 32766 outPtr += 8;
159 }
160
161 2 volk_32f_s32f_s32f_mod_range_32f_generic(
162 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
163 2 }
164 #endif /* LV_HAVE_AVX */
165
166
167 #ifdef LV_HAVE_SSE2
168 #include <xmmintrin.h>
169
170 2 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
171 const float* inputVector,
172 const float lower_bound,
173 const float upper_bound,
174 unsigned int num_points)
175 {
176 2 const __m128 lower = _mm_set_ps1(lower_bound);
177 2 const __m128 upper = _mm_set_ps1(upper_bound);
178 2 const __m128 distance = _mm_sub_ps(upper, lower);
179 __m128 input, output;
180 __m128 is_smaller, is_bigger;
181 __m128 excess, adj;
182
183 2 const float* inPtr = inputVector;
184 2 float* outPtr = outputVector;
185 2 const size_t quarter_points = num_points / 4;
186
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (size_t counter = 0; counter < quarter_points; counter++) {
187 65534 input = _mm_loadu_ps(inPtr);
188 // calculate mask: input < lower, input > upper
189 65534 is_smaller = _mm_cmplt_ps(input, lower);
190 65534 is_bigger = _mm_cmpgt_ps(input, upper);
191 // find out how far we are out-of-bound – positive values!
192 131068 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193 196602 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
194 // how many do we have to add? (int(excess/distance+1)*distance)
195 65534 excess = _mm_div_ps(excess, distance);
196 // round down
197 131068 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
198 // plus 1
199 65534 adj = _mm_set_ps1(1.0f);
200 65534 excess = _mm_add_ps(excess, adj);
201 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
202 65534 adj = _mm_and_ps(adj, is_smaller);
203 196602 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
204 // scale by distance, sign
205 131068 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206 65534 output = _mm_add_ps(input, excess);
207 _mm_storeu_ps(outPtr, output);
208 65534 inPtr += 4;
209 65534 outPtr += 4;
210 }
211
212 2 volk_32f_s32f_s32f_mod_range_32f_generic(
213 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
214 2 }
215 2 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
216 const float* inputVector,
217 const float lower_bound,
218 const float upper_bound,
219 unsigned int num_points)
220 {
221 2 const __m128 lower = _mm_set_ps1(lower_bound);
222 2 const __m128 upper = _mm_set_ps1(upper_bound);
223 2 const __m128 distance = _mm_sub_ps(upper, lower);
224 __m128 input, output;
225 __m128 is_smaller, is_bigger;
226 __m128 excess, adj;
227
228 2 const float* inPtr = inputVector;
229 2 float* outPtr = outputVector;
230 2 const size_t quarter_points = num_points / 4;
231
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (size_t counter = 0; counter < quarter_points; counter++) {
232 65534 input = _mm_load_ps(inPtr);
233 // calculate mask: input < lower, input > upper
234 65534 is_smaller = _mm_cmplt_ps(input, lower);
235 65534 is_bigger = _mm_cmpgt_ps(input, upper);
236 // find out how far we are out-of-bound – positive values!
237 131068 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238 196602 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
239 // how many do we have to add? (int(excess/distance+1)*distance)
240 65534 excess = _mm_div_ps(excess, distance);
241 // round down: truncate toward zero with a 4x float -> 4x int32 -> 4x float
242 // round trip (the packed float -> int32 conversion is an SSE2 instruction).
243 131068 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
244 // plus 1
245 65534 adj = _mm_set_ps1(1.0f);
246 65534 excess = _mm_add_ps(excess, adj);
247 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
248 65534 adj = _mm_and_ps(adj, is_smaller);
249 196602 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
250 // scale by distance, sign
251 131068 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252 65534 output = _mm_add_ps(input, excess);
253 _mm_store_ps(outPtr, output);
254 65534 inPtr += 4;
255 65534 outPtr += 4;
256 }
257
258 2 volk_32f_s32f_s32f_mod_range_32f_generic(
259 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
260 2 }
261 #endif /* LV_HAVE_SSE2 */
262
263 #ifdef LV_HAVE_SSE
264 #include <xmmintrin.h>
265
266 2 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
267 const float* inputVector,
268 const float lower_bound,
269 const float upper_bound,
270 unsigned int num_points)
271 {
272 2 const __m128 lower = _mm_set_ps1(lower_bound);
273 2 const __m128 upper = _mm_set_ps1(upper_bound);
274 2 const __m128 distance = _mm_sub_ps(upper, lower);
275 __m128 input, output;
276 __m128 is_smaller, is_bigger;
277 __m128 excess, adj;
278 __m128i rounddown;
279
280 2 const float* inPtr = inputVector;
281 2 float* outPtr = outputVector;
282 2 const size_t quarter_points = num_points / 4;
283
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (size_t counter = 0; counter < quarter_points; counter++) {
284 65534 input = _mm_loadu_ps(inPtr);
285 // calculate mask: input < lower, input > upper
286 65534 is_smaller = _mm_cmplt_ps(input, lower);
287 65534 is_bigger = _mm_cmpgt_ps(input, upper);
288 // find out how far we are out-of-bound – positive values!
289 131068 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290 196602 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
291 // how many do we have to add? (int(excess/distance+1)*distance)
292 65534 excess = _mm_div_ps(excess, distance);
293 // round down (truncate toward zero via an int32 round trip)
294 65534 rounddown = _mm_cvttps_epi32(excess);
295 65534 excess = _mm_cvtepi32_ps(rounddown);
296 // plus 1
297 65534 adj = _mm_set_ps1(1.0f);
298 65534 excess = _mm_add_ps(excess, adj);
299 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
300 65534 adj = _mm_and_ps(adj, is_smaller);
301 196602 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
302 // scale by distance, sign
303 131068 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304 65534 output = _mm_add_ps(input, excess);
305 _mm_storeu_ps(outPtr, output);
306 65534 inPtr += 4;
307 65534 outPtr += 4;
308 }
309
310 2 volk_32f_s32f_s32f_mod_range_32f_generic(
311 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
312 2 }
313 2 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
314 const float* inputVector,
315 const float lower_bound,
316 const float upper_bound,
317 unsigned int num_points)
318 {
319 2 const __m128 lower = _mm_set_ps1(lower_bound);
320 2 const __m128 upper = _mm_set_ps1(upper_bound);
321 2 const __m128 distance = _mm_sub_ps(upper, lower);
322 __m128 input, output;
323 __m128 is_smaller, is_bigger;
324 __m128 excess, adj;
325 __m128i rounddown;
326
327 2 const float* inPtr = inputVector;
328 2 float* outPtr = outputVector;
329 2 const size_t quarter_points = num_points / 4;
330
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (size_t counter = 0; counter < quarter_points; counter++) {
331 65534 input = _mm_load_ps(inPtr);
332 // calculate mask: input < lower, input > upper
333 65534 is_smaller = _mm_cmplt_ps(input, lower);
334 65534 is_bigger = _mm_cmpgt_ps(input, upper);
335 // find out how far we are out-of-bound – positive values!
336 131068 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337 196602 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
338 // how many do we have to add? (int(excess/distance+1)*distance)
339 65534 excess = _mm_div_ps(excess, distance);
340 // round down
341 65534 rounddown = _mm_cvttps_epi32(excess);
342 65534 excess = _mm_cvtepi32_ps(rounddown);
343 // plus 1
344 65534 adj = _mm_set_ps1(1.0f);
345 65534 excess = _mm_add_ps(excess, adj);
346 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
347 65534 adj = _mm_and_ps(adj, is_smaller);
348 196602 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
349 // scale by distance, sign
350 131068 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351 65534 output = _mm_add_ps(input, excess);
352 _mm_store_ps(outPtr, output);
353 65534 inPtr += 4;
354 65534 outPtr += 4;
355 }
356
357 2 volk_32f_s32f_s32f_mod_range_32f_generic(
358 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
359 2 }
360 #endif /* LV_HAVE_SSE */
361
362
363 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
364