Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_binary_slicer_8i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Slices input floats and returns 1 when the input >= 0 and 0 | ||
16 | * when < 0. Results are converted to 8-bit chars. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int | ||
21 | num_points) | ||
22 | * \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li aVector: The input vector of floats. | ||
26 | * \li num_points: The number of data points. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li cVector: The output vector of 8-bit chars. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Generate bytes of a 7-bit barker code from floats. | ||
33 | * \code | ||
34 | int N = 7; | ||
35 | unsigned int alignment = volk_get_alignment(); | ||
36 | float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment); | ||
38 | |||
39 | in[0] = 0.9f; | ||
40 | in[1] = 1.1f; | ||
41 | in[2] = 0.4f; | ||
42 | in[3] = -0.7f; | ||
43 | in[4] = -1.2f; | ||
44 | in[5] = 0.2f; | ||
45 | in[6] = -0.8f; | ||
46 | |||
47 | volk_32f_binary_slicer_8i(out, in, N); | ||
48 | |||
49 | for(unsigned int ii = 0; ii < N; ++ii){ | ||
50 | printf("out(%i) = %i\n", ii, out[ii]); | ||
51 | } | ||
52 | |||
53 | volk_free(in); | ||
54 | volk_free(out); | ||
55 | |||
56 | * \endcode | ||
57 | */ | ||
58 | |||
59 | #ifndef INCLUDED_volk_32f_binary_slicer_8i_H | ||
60 | #define INCLUDED_volk_32f_binary_slicer_8i_H | ||
61 | |||
62 | |||
63 | #ifdef LV_HAVE_GENERIC | ||
64 | |||
static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    /* Scalar reference slicer: write 1 for each input >= 0, otherwise 0. */
    const float* in = aVector;
    int8_t* out = cVector;

    for (unsigned int n = 0; n < num_points; n++) {
        out[n] = (in[n] >= 0.0f) ? 1 : 0;
    }
}
81 | #endif /* LV_HAVE_GENERIC */ | ||
82 | |||
83 | |||
84 | #ifdef LV_HAVE_GENERIC | ||
85 | |||
static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                const float* aVector,
                                                                unsigned int num_points)
{
    /* Branch-free scalar slicer: the comparison result (0 or 1) is
     * stored directly, with no conditional jump in the loop body. */
    unsigned int n;

    for (n = 0; n < num_points; n++) {
        cVector[n] = (int8_t)(aVector[n] >= 0.0f);
    }
}
98 | #endif /* LV_HAVE_GENERIC */ | ||
99 | |||
100 | |||
101 | #ifdef LV_HAVE_AVX2 | ||
102 | #include <immintrin.h> | ||
103 | |||
static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    /* Aligned AVX2 slicer: emits 1 for each input >= 0, else 0.
     * Processes 32 floats per iteration; aVector and cVector must be
     * 32-byte aligned for the aligned load/store intrinsics below.
     * The remaining num_points % 32 samples are handled scalar. */
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    /* byte permutation (same pattern in both 128-bit lanes) that restores
     * input order after the two in-lane pack steps below */
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; return float (all-ones mask for true, zero for false)
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31: the all-ones mask is a NaN bit pattern
        // that cvtps converts to 0x80000000; its top bit shifted down gives 1,
        // while a zero mask stays 0 -- leaving exactly 0/1 per 32-bit lane
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack in to 16-bit results (saturating, but values are only 0/1)
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack in to 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after shuffle_epi8)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    // scalar tail for the remaining num_points % 32 samples
    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
203 | #endif | ||
204 | |||
205 | #ifdef LV_HAVE_AVX2 | ||
206 | #include <immintrin.h> | ||
207 | |||
static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    /* Unaligned AVX2 slicer: emits 1 for each input >= 0, else 0.
     * Identical algorithm to the aligned variant, but uses loadu/storeu
     * so aVector and cVector need no particular alignment.
     * The remaining num_points % 32 samples are handled scalar. */
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    /* byte permutation (same pattern in both 128-bit lanes) that restores
     * input order after the two in-lane pack steps below */
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        // compare >= 0; return float (all-ones mask for true, zero for false)
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31: the all-ones mask is a NaN bit pattern
        // that cvtps converts to 0x80000000; its top bit shifted down gives 1,
        // while a zero mask stays 0 -- leaving exactly 0/1 per 32-bit lane
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack in to 16-bit results (saturating, but values are only 0/1)
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack in to 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after shuffle_epi8)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    // scalar tail for the remaining num_points % 32 samples
    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
307 | #endif | ||
308 | |||
309 | |||
310 | #ifdef LV_HAVE_SSE2 | ||
311 | |||
312 | #include <emmintrin.h> | ||
313 | |||
static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    /* Aligned SSE2 slicer: emits 1 for each input >= 0, else 0.
     * Handles 16 floats per loop via four aligned 4-float loads; the
     * remaining num_points % 16 samples are sliced scalar. Both
     * aVector and cVector must be 16-byte aligned. */
    const float* src = aVector;
    int8_t* dst = cVector;
    const unsigned int blocks = num_points / 16;
    const __m128 zeros = _mm_set1_ps(0.0f);
    unsigned int k;

    for (k = 0; k < blocks; k++) {
        __m128 v0 = _mm_load_ps(src);
        __m128 v1 = _mm_load_ps(src + 4);
        __m128 v2 = _mm_load_ps(src + 8);
        __m128 v3 = _mm_load_ps(src + 12);

        /* lane-wise >= 0 gives an all-ones or all-zeros float mask */
        __m128 m0 = _mm_cmpge_ps(v0, zeros);
        __m128 m1 = _mm_cmpge_ps(v1, zeros);
        __m128 m2 = _mm_cmpge_ps(v2, zeros);
        __m128 m3 = _mm_cmpge_ps(v3, zeros);

        /* convert the mask to int32 and shift right by 31, so each
         * 32-bit lane holds exactly 0 or 1 */
        __m128i i0 = _mm_srli_epi32(_mm_cvtps_epi32(m0), 31);
        __m128i i1 = _mm_srli_epi32(_mm_cvtps_epi32(m1), 31);
        __m128i i2 = _mm_srli_epi32(_mm_cvtps_epi32(m2), 31);
        __m128i i3 = _mm_srli_epi32(_mm_cvtps_epi32(m3), 31);

        /* saturating packs 32 -> 16 -> 8 bit; values are only 0/1, so
         * no saturation actually occurs and order is preserved */
        i0 = _mm_packs_epi32(i0, i1);
        i2 = _mm_packs_epi32(i2, i3);
        i0 = _mm_packs_epi16(i0, i2);

        _mm_store_si128((__m128i*)dst, i0);

        dst += 16;
        src += 16;
    }

    /* scalar tail */
    for (k = blocks * 16; k < num_points; k++) {
        *dst++ = (*src++ >= 0.0f) ? 1 : 0;
    }
}
368 | #endif /* LV_HAVE_SSE2 */ | ||
369 | |||
370 | |||
371 | #ifdef LV_HAVE_SSE2 | ||
372 | #include <emmintrin.h> | ||
373 | |||
static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    /* Unaligned SSE2 slicer: emits 1 for each input >= 0, else 0.
     * Same pipeline as the aligned variant, but loadu/storeu remove
     * any alignment requirement on aVector/cVector. The remaining
     * num_points % 16 samples are sliced scalar. */
    const float* src = aVector;
    int8_t* dst = cVector;
    const unsigned int blocks = num_points / 16;
    const __m128 zeros = _mm_set1_ps(0.0f);
    unsigned int k;

    for (k = 0; k < blocks; k++) {
        __m128 v0 = _mm_loadu_ps(src);
        __m128 v1 = _mm_loadu_ps(src + 4);
        __m128 v2 = _mm_loadu_ps(src + 8);
        __m128 v3 = _mm_loadu_ps(src + 12);

        /* lane-wise >= 0 gives an all-ones or all-zeros float mask */
        __m128 m0 = _mm_cmpge_ps(v0, zeros);
        __m128 m1 = _mm_cmpge_ps(v1, zeros);
        __m128 m2 = _mm_cmpge_ps(v2, zeros);
        __m128 m3 = _mm_cmpge_ps(v3, zeros);

        /* convert the mask to int32 and shift right by 31, so each
         * 32-bit lane holds exactly 0 or 1 */
        __m128i i0 = _mm_srli_epi32(_mm_cvtps_epi32(m0), 31);
        __m128i i1 = _mm_srli_epi32(_mm_cvtps_epi32(m1), 31);
        __m128i i2 = _mm_srli_epi32(_mm_cvtps_epi32(m2), 31);
        __m128i i3 = _mm_srli_epi32(_mm_cvtps_epi32(m3), 31);

        /* saturating packs 32 -> 16 -> 8 bit; values are only 0/1, so
         * no saturation actually occurs and order is preserved */
        i0 = _mm_packs_epi32(i0, i1);
        i2 = _mm_packs_epi32(i2, i3);
        i0 = _mm_packs_epi16(i0, i2);

        _mm_storeu_si128((__m128i*)dst, i0);

        dst += 16;
        src += 16;
    }

    /* scalar tail */
    for (k = blocks * 16; k < num_points; k++) {
        *dst++ = (*src++ >= 0.0f) ? 1 : 0;
    }
}
428 | #endif /* LV_HAVE_SSE2 */ | ||
429 | |||
430 | |||
431 | #ifdef LV_HAVE_NEON | ||
432 | #include <arm_neon.h> | ||
433 | |||
static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    /* NEON slicer: emits 1 for each input >= 0, else 0.
     * Processes 16 floats per iteration; the remaining
     * num_points % 16 samples are handled scalar. */
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0);
    one = vdup_n_u8(0x01);

    // TODO: this is a good candidate for asm because the vcombines
    // can be eliminated simply by picking dst registers that are
    // adjacent.
    for (number = 0; number < n16points; number++) {
        // vld2q de-interleaves into even/odd element vectors; the matching
        // vst2 below re-interleaves, so output order still matches input order
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);

        // test against 0; return uint32 (all-ones for true, zero for false)
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16 followed by combine to 8-element vectors
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16x8 -> uint8x8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
        // we *could* load twice as much data and do another vcombine here
        // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
        // but that turns out to be ~16% slower than this version on zc702
        // it's possible register contention in GCC scheduler slows it down
        // and a hand-written asm with quad-word u8 registers is much faster.

        // mask the all-ones comparison bytes down to 0x01 (0x00 stays 0x00)
        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    // scalar tail for the remaining num_points % 16 samples
    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
501 | #endif /* LV_HAVE_NEON */ | ||
502 | |||
503 | |||
504 | #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */ | ||
505 |