Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2017 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_s32f_s32f_mod_range_32f | ||
12 | * | ||
13 | * \b wraps floating point numbers to stay within a defined [min,max] range | ||
14 | * | ||
15 | * <b>Dispatcher Prototype</b> | ||
16 | * \code | ||
17 | * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, | ||
18 | * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode | ||
19 | * | ||
20 | * \b Inputs | ||
21 | * \li inputVector: The input vector | ||
22 | * \li lower_bound: The lower output boundary | ||
23 | * \li upper_bound: The upper output boundary | ||
24 | * \li num_points: The number of data points. | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li outputVector: The vector where the results will be stored. | ||
28 | * | ||
29 | * | ||
30 | */ | ||
31 | |||
32 | #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H | ||
33 | #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H | ||
34 | |||
35 | #ifdef LV_HAVE_GENERIC | ||
36 | |||
37 | 14 | static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, | |
38 | const float* inputVector, | ||
39 | const float lower_bound, | ||
40 | const float upper_bound, | ||
41 | unsigned int num_points) | ||
42 | { | ||
43 | 14 | float* outPtr = outputVector; | |
44 | const float* inPtr; | ||
45 | 14 | const float distance = upper_bound - lower_bound; | |
46 | |||
47 |
2/2✓ Branch 0 taken 262194 times.
✓ Branch 1 taken 14 times.
|
262208 | for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) { |
48 | 262194 | float val = *inPtr; | |
49 |
1/2✓ Branch 0 taken 262194 times.
✗ Branch 1 not taken.
|
262194 | if (val < lower_bound) { |
50 | 262194 | float excess = lower_bound - val; | |
51 | 262194 | signed int count = (int)(excess / distance); | |
52 | 262194 | *outPtr = val + (count + 1) * distance; | |
53 | ✗ | } else if (val > upper_bound) { | |
54 | ✗ | float excess = val - upper_bound; | |
55 | ✗ | signed int count = (int)(excess / distance); | |
56 | ✗ | *outPtr = val - (count + 1) * distance; | |
57 | } else | ||
58 | ✗ | *outPtr = val; | |
59 | 262194 | outPtr++; | |
60 | } | ||
61 | 14 | } | |
62 | #endif /* LV_HAVE_GENERIC */ | ||
63 | |||
64 | |||
65 | #ifdef LV_HAVE_AVX | ||
66 | #include <xmmintrin.h> | ||
67 | |||
68 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, | |
69 | const float* inputVector, | ||
70 | const float lower_bound, | ||
71 | const float upper_bound, | ||
72 | unsigned int num_points) | ||
73 | { | ||
74 | 2 | const __m256 lower = _mm256_set1_ps(lower_bound); | |
75 | 2 | const __m256 upper = _mm256_set1_ps(upper_bound); | |
76 | 2 | const __m256 distance = _mm256_sub_ps(upper, lower); | |
77 | __m256 input, output; | ||
78 | __m256 is_smaller, is_bigger; | ||
79 | __m256 excess, adj; | ||
80 | |||
81 | 2 | const float* inPtr = inputVector; | |
82 | 2 | float* outPtr = outputVector; | |
83 | 2 | const size_t eight_points = num_points / 8; | |
84 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (size_t counter = 0; counter < eight_points; counter++) { |
85 | 32766 | input = _mm256_loadu_ps(inPtr); | |
86 | // calculate mask: input < lower, input > upper | ||
87 | 32766 | is_smaller = _mm256_cmp_ps( | |
88 | input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling | ||
89 | 32766 | is_bigger = _mm256_cmp_ps( | |
90 | input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling | ||
91 | // find out how far we are out-of-bound – positive values! | ||
92 | 65532 | excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); | |
93 | excess = | ||
94 | 98298 | _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); | |
95 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
96 | 32766 | excess = _mm256_div_ps(excess, distance); | |
97 | // round down | ||
98 | 65532 | excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); | |
99 | // plus 1 | ||
100 | 32766 | adj = _mm256_set1_ps(1.0f); | |
101 | 32766 | excess = _mm256_add_ps(excess, adj); | |
102 | // get the sign right, adj is still 1.0f in all eight lanes | ||
103 | 32766 | adj = _mm256_and_ps(adj, is_smaller); | |
104 | 98298 | adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); | |
105 | // scale by distance, sign | ||
106 | 65532 | excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); | |
107 | 32766 | output = _mm256_add_ps(input, excess); | |
108 | _mm256_storeu_ps(outPtr, output); | ||
109 | 32766 | inPtr += 8; | |
110 | 32766 | outPtr += 8; | |
111 | } | ||
112 | |||
113 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
114 | outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8); | ||
115 | 2 | } | |
116 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, | |
117 | const float* inputVector, | ||
118 | const float lower_bound, | ||
119 | const float upper_bound, | ||
120 | unsigned int num_points) | ||
121 | { | ||
122 | 2 | const __m256 lower = _mm256_set1_ps(lower_bound); | |
123 | 2 | const __m256 upper = _mm256_set1_ps(upper_bound); | |
124 | 2 | const __m256 distance = _mm256_sub_ps(upper, lower); | |
125 | __m256 input, output; | ||
126 | __m256 is_smaller, is_bigger; | ||
127 | __m256 excess, adj; | ||
128 | |||
129 | 2 | const float* inPtr = inputVector; | |
130 | 2 | float* outPtr = outputVector; | |
131 | 2 | const size_t eight_points = num_points / 8; | |
132 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (size_t counter = 0; counter < eight_points; counter++) { |
133 | 32766 | input = _mm256_load_ps(inPtr); | |
134 | // calculate mask: input < lower, input > upper | ||
135 | 32766 | is_smaller = _mm256_cmp_ps( | |
136 | input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling | ||
137 | 32766 | is_bigger = _mm256_cmp_ps( | |
138 | input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling | ||
139 | // find out how far we are out-of-bound – positive values! | ||
140 | 65532 | excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); | |
141 | excess = | ||
142 | 98298 | _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); | |
143 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
144 | 32766 | excess = _mm256_div_ps(excess, distance); | |
145 | // round down | ||
146 | 65532 | excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); | |
147 | // plus 1 | ||
148 | 32766 | adj = _mm256_set1_ps(1.0f); | |
149 | 32766 | excess = _mm256_add_ps(excess, adj); | |
150 | // get the sign right, adj is still 1.0f in all eight lanes | ||
151 | 32766 | adj = _mm256_and_ps(adj, is_smaller); | |
152 | 98298 | adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); | |
153 | // scale by distance, sign | ||
154 | 65532 | excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); | |
155 | 32766 | output = _mm256_add_ps(input, excess); | |
156 | _mm256_store_ps(outPtr, output); | ||
157 | 32766 | inPtr += 8; | |
158 | 32766 | outPtr += 8; | |
159 | } | ||
160 | |||
161 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
162 | outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8); | ||
163 | 2 | } | |
164 | #endif /* LV_HAVE_AVX */ | ||
165 | |||
166 | |||
167 | #ifdef LV_HAVE_SSE2 | ||
168 | #include <xmmintrin.h> | ||
169 | |||
170 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, | |
171 | const float* inputVector, | ||
172 | const float lower_bound, | ||
173 | const float upper_bound, | ||
174 | unsigned int num_points) | ||
175 | { | ||
176 | 2 | const __m128 lower = _mm_set_ps1(lower_bound); | |
177 | 2 | const __m128 upper = _mm_set_ps1(upper_bound); | |
178 | 2 | const __m128 distance = _mm_sub_ps(upper, lower); | |
179 | __m128 input, output; | ||
180 | __m128 is_smaller, is_bigger; | ||
181 | __m128 excess, adj; | ||
182 | |||
183 | 2 | const float* inPtr = inputVector; | |
184 | 2 | float* outPtr = outputVector; | |
185 | 2 | const size_t quarter_points = num_points / 4; | |
186 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (size_t counter = 0; counter < quarter_points; counter++) { |
187 | 65534 | input = _mm_loadu_ps(inPtr); | |
188 | // calculate mask: input < lower, input > upper | ||
189 | 65534 | is_smaller = _mm_cmplt_ps(input, lower); | |
190 | 65534 | is_bigger = _mm_cmpgt_ps(input, upper); | |
191 | // find out how far we are out-of-bound – positive values! | ||
192 | 131068 | excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); | |
193 | 196602 | excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); | |
194 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
195 | 65534 | excess = _mm_div_ps(excess, distance); | |
196 | // round down | ||
197 | 131068 | excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); | |
198 | // plus 1 | ||
199 | 65534 | adj = _mm_set_ps1(1.0f); | |
200 | 65534 | excess = _mm_add_ps(excess, adj); | |
201 | // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} | ||
202 | 65534 | adj = _mm_and_ps(adj, is_smaller); | |
203 | 196602 | adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); | |
204 | // scale by distance, sign | ||
205 | 131068 | excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); | |
206 | 65534 | output = _mm_add_ps(input, excess); | |
207 | _mm_storeu_ps(outPtr, output); | ||
208 | 65534 | inPtr += 4; | |
209 | 65534 | outPtr += 4; | |
210 | } | ||
211 | |||
212 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
213 | outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4); | ||
214 | 2 | } | |
215 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, | |
216 | const float* inputVector, | ||
217 | const float lower_bound, | ||
218 | const float upper_bound, | ||
219 | unsigned int num_points) | ||
220 | { | ||
221 | 2 | const __m128 lower = _mm_set_ps1(lower_bound); | |
222 | 2 | const __m128 upper = _mm_set_ps1(upper_bound); | |
223 | 2 | const __m128 distance = _mm_sub_ps(upper, lower); | |
224 | __m128 input, output; | ||
225 | __m128 is_smaller, is_bigger; | ||
226 | __m128 excess, adj; | ||
227 | |||
228 | 2 | const float* inPtr = inputVector; | |
229 | 2 | float* outPtr = outputVector; | |
230 | 2 | const size_t quarter_points = num_points / 4; | |
231 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (size_t counter = 0; counter < quarter_points; counter++) { |
232 | 65534 | input = _mm_load_ps(inPtr); | |
233 | // calculate mask: input < lower, input > upper | ||
234 | 65534 | is_smaller = _mm_cmplt_ps(input, lower); | |
235 | 65534 | is_bigger = _mm_cmpgt_ps(input, upper); | |
236 | // find out how far we are out-of-bound – positive values! | ||
237 | 131068 | excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); | |
238 | 196602 | excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); | |
239 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
240 | 65534 | excess = _mm_div_ps(excess, distance); | |
241 | // round down (truncation toward zero; excess is never negative) – the packed | ||
242 | // 4x float -> 4x int32 conversion used here is an SSE2 instruction. | ||
243 | 131068 | excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); | |
244 | // plus 1 | ||
245 | 65534 | adj = _mm_set_ps1(1.0f); | |
246 | 65534 | excess = _mm_add_ps(excess, adj); | |
247 | // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} | ||
248 | 65534 | adj = _mm_and_ps(adj, is_smaller); | |
249 | 196602 | adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); | |
250 | // scale by distance, sign | ||
251 | 131068 | excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); | |
252 | 65534 | output = _mm_add_ps(input, excess); | |
253 | _mm_store_ps(outPtr, output); | ||
254 | 65534 | inPtr += 4; | |
255 | 65534 | outPtr += 4; | |
256 | } | ||
257 | |||
258 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
259 | outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4); | ||
260 | 2 | } | |
261 | #endif /* LV_HAVE_SSE2 */ | ||
262 | |||
263 | #ifdef LV_HAVE_SSE | ||
264 | #include <xmmintrin.h> | ||
265 | |||
266 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, | |
267 | const float* inputVector, | ||
268 | const float lower_bound, | ||
269 | const float upper_bound, | ||
270 | unsigned int num_points) | ||
271 | { | ||
272 | 2 | const __m128 lower = _mm_set_ps1(lower_bound); | |
273 | 2 | const __m128 upper = _mm_set_ps1(upper_bound); | |
274 | 2 | const __m128 distance = _mm_sub_ps(upper, lower); | |
275 | __m128 input, output; | ||
276 | __m128 is_smaller, is_bigger; | ||
277 | __m128 excess, adj; | ||
278 | __m128i rounddown; | ||
279 | |||
280 | 2 | const float* inPtr = inputVector; | |
281 | 2 | float* outPtr = outputVector; | |
282 | 2 | const size_t quarter_points = num_points / 4; | |
283 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (size_t counter = 0; counter < quarter_points; counter++) { |
284 | 65534 | input = _mm_loadu_ps(inPtr); | |
285 | // calculate mask: input < lower, input > upper | ||
286 | 65534 | is_smaller = _mm_cmplt_ps(input, lower); | |
287 | 65534 | is_bigger = _mm_cmpgt_ps(input, upper); | |
288 | // find out how far we are out-of-bound – positive values! | ||
289 | 131068 | excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); | |
290 | 196602 | excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); | |
291 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
292 | 65534 | excess = _mm_div_ps(excess, distance); | |
293 | // round down (truncation toward zero; excess is never negative) | ||
294 | 65534 | rounddown = _mm_cvttps_epi32(excess); | |
295 | 65534 | excess = _mm_cvtepi32_ps(rounddown); | |
296 | // plus 1 | ||
297 | 65534 | adj = _mm_set_ps1(1.0f); | |
298 | 65534 | excess = _mm_add_ps(excess, adj); | |
299 | // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} | ||
300 | 65534 | adj = _mm_and_ps(adj, is_smaller); | |
301 | 196602 | adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); | |
302 | // scale by distance, sign | ||
303 | 131068 | excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); | |
304 | 65534 | output = _mm_add_ps(input, excess); | |
305 | _mm_storeu_ps(outPtr, output); | ||
306 | 65534 | inPtr += 4; | |
307 | 65534 | outPtr += 4; | |
308 | } | ||
309 | |||
310 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
311 | outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4); | ||
312 | 2 | } | |
313 | 2 | static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, | |
314 | const float* inputVector, | ||
315 | const float lower_bound, | ||
316 | const float upper_bound, | ||
317 | unsigned int num_points) | ||
318 | { | ||
319 | 2 | const __m128 lower = _mm_set_ps1(lower_bound); | |
320 | 2 | const __m128 upper = _mm_set_ps1(upper_bound); | |
321 | 2 | const __m128 distance = _mm_sub_ps(upper, lower); | |
322 | __m128 input, output; | ||
323 | __m128 is_smaller, is_bigger; | ||
324 | __m128 excess, adj; | ||
325 | __m128i rounddown; | ||
326 | |||
327 | 2 | const float* inPtr = inputVector; | |
328 | 2 | float* outPtr = outputVector; | |
329 | 2 | const size_t quarter_points = num_points / 4; | |
330 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (size_t counter = 0; counter < quarter_points; counter++) { |
331 | 65534 | input = _mm_load_ps(inPtr); | |
332 | // calculate mask: input < lower, input > upper | ||
333 | 65534 | is_smaller = _mm_cmplt_ps(input, lower); | |
334 | 65534 | is_bigger = _mm_cmpgt_ps(input, upper); | |
335 | // find out how far we are out-of-bound – positive values! | ||
336 | 131068 | excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); | |
337 | 196602 | excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); | |
338 | // how many do we have to add? ((int(excess/distance) + 1) * distance) | ||
339 | 65534 | excess = _mm_div_ps(excess, distance); | |
340 | // round down | ||
341 | 65534 | rounddown = _mm_cvttps_epi32(excess); | |
342 | 65534 | excess = _mm_cvtepi32_ps(rounddown); | |
343 | // plus 1 | ||
344 | 65534 | adj = _mm_set_ps1(1.0f); | |
345 | 65534 | excess = _mm_add_ps(excess, adj); | |
346 | // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} | ||
347 | 65534 | adj = _mm_and_ps(adj, is_smaller); | |
348 | 196602 | adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); | |
349 | // scale by distance, sign | ||
350 | 131068 | excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); | |
351 | 65534 | output = _mm_add_ps(input, excess); | |
352 | _mm_store_ps(outPtr, output); | ||
353 | 65534 | inPtr += 4; | |
354 | 65534 | outPtr += 4; | |
355 | } | ||
356 | |||
357 | 2 | volk_32f_s32f_s32f_mod_range_32f_generic( | |
358 | outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4); | ||
359 | 2 | } | |
360 | #endif /* LV_HAVE_SSE */ | ||
361 | |||
362 | |||
363 | #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ | ||
364 |