GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_binary_slicer_8i.h
Date: 2023-10-23 23:10:04
            Exec  Total  Coverage
Lines:       138    138    100.0%
Functions:     6      6    100.0%
Branches:     30     30    100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_binary_slicer_8i
12 *
13 * \b Overview
14 *
15 * Slices input floats and returns 1 when the input is >= 0 and 0
16 * when it is < 0. Results are converted to 8-bit chars.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int
21 num_points)
22 * \endcode
23 *
24 * \b Inputs
25 * \li aVector: The input vector of floats.
26 * \li num_points: The number of data points.
27 *
28 * \b Outputs
29 * \li cVector: The output vector of 8-bit chars.
30 *
31 * \b Example
32 * Generate bytes of a 7-bit Barker code from floats.
33 * \code
34 int N = 7;
35 unsigned int alignment = volk_get_alignment();
36 float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
37 int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
38
39 in[0] = 0.9f;
40 in[1] = 1.1f;
41 in[2] = 0.4f;
42 in[3] = -0.7f;
43 in[4] = -1.2f;
44 in[5] = 0.2f;
45 in[6] = -0.8f;
46
47 volk_32f_binary_slicer_8i(out, in, N);
48
49 for(unsigned int ii = 0; ii < N; ++ii){
50 printf("out(%u) = %i\n", ii, out[ii]);
51 }
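    // With the inputs above, every element >= 0 slices to 1 and every
    // negative element to 0, so the loop prints:
    //   out(0) = 1, out(1) = 1, out(2) = 1, out(3) = 0,
    //   out(4) = 0, out(5) = 1, out(6) = 0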
52
53 volk_free(in);
54 volk_free(out);
55
56 * \endcode
57 */
58
59 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H
60 #define INCLUDED_volk_32f_binary_slicer_8i_H
61
62
63 #ifdef LV_HAVE_GENERIC
64
65 2 static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
66 const float* aVector,
67 unsigned int num_points)
68 {
69 2 int8_t* cPtr = cVector;
70 2 const float* aPtr = aVector;
71 2 unsigned int number = 0;
72
73
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
74
2/2
✓ Branch 0 taken 131279 times.
✓ Branch 1 taken 130863 times.
262142 if (*aPtr++ >= 0) {
75 131279 *cPtr++ = 1;
76 } else {
77 130863 *cPtr++ = 0;
78 }
79 }
80 2 }
81 #endif /* LV_HAVE_GENERIC */
82
83
84 #ifdef LV_HAVE_GENERIC
85
86 2 static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
87 const float* aVector,
88 unsigned int num_points)
89 {
90 2 int8_t* cPtr = cVector;
91 2 const float* aPtr = aVector;
92 2 unsigned int number = 0;
93
94
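// note: a C relational expression evaluates to exactly 1 or 0,
// so the store below needs no per-element branch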
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
95 262142 *cPtr++ = (*aPtr++ >= 0);
96 }
97 2 }
98 #endif /* LV_HAVE_GENERIC */
99
100
101 #ifdef LV_HAVE_AVX2
102 #include <immintrin.h>
103
104 2 static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
105 const float* aVector,
106 unsigned int num_points)
107 {
108 2 int8_t* cPtr = cVector;
109 2 const float* aPtr = aVector;
110 2 unsigned int number = 0;
111 2 unsigned int n32points = num_points / 32;
112
113 2 const __m256 zero_val = _mm256_set1_ps(0.0f);
114 __m256 a0_val, a1_val, a2_val, a3_val;
115 __m256 res0_f, res1_f, res2_f, res3_f;
116 __m256i res0_i, res1_i, res2_i, res3_i;
117 2 __m256i byte_shuffle = _mm256_set_epi8(15,
118 14,
119 13,
120 12,
121 7,
122 6,
123 5,
124 4,
125 11,
126 10,
127 9,
128 8,
129 3,
130 2,
131 1,
132 0,
133 15,
134 14,
135 13,
136 12,
137 7,
138 6,
139 5,
140 4,
141 11,
142 10,
143 9,
144 8,
145 3,
146 2,
147 1,
148 0);
149
150
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (number = 0; number < n32points; number++) {
151 8190 a0_val = _mm256_load_ps(aPtr);
152 8190 a1_val = _mm256_load_ps(aPtr + 8);
153 8190 a2_val = _mm256_load_ps(aPtr + 16);
154 8190 a3_val = _mm256_load_ps(aPtr + 24);
155
156 // compare >= 0; return float
157 8190 res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
158 8190 res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
159 8190 res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
160 8190 res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
161
162 // convert to 32i and >> 31
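// (a true compare yields the all-ones bit pattern, which is a NaN float;
// _mm256_cvtps_epi32 maps that to 0x80000000, so a logical right shift
// by 31 leaves exactly 1, while a false result of 0.0f stays 0)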
163 16380 res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
164 16380 res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
165 16380 res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
166 16380 res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
167
168 // pack into 16-bit results
169 8190 res0_i = _mm256_packs_epi32(res0_i, res1_i);
170 8190 res2_i = _mm256_packs_epi32(res2_i, res3_i);
171 // pack into 8-bit results
172 // res0: (after packs_epi32)
173 // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
174 // res2:
175 // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
176 8190 res0_i = _mm256_packs_epi16(res0_i, res2_i);
177 // shuffle the lanes
178 // res0: (after packs_epi16)
179 // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
180 // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
181 // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
182 8190 res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
183
184 // shuffle bytes within lanes
186 // res0: (after permute4x64_epi64)
186 // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
187 // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
188 8190 res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
189
190 _mm256_store_si256((__m256i*)cPtr, res0_i);
191 8190 aPtr += 32;
192 8190 cPtr += 32;
193 }
194
195
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (number = n32points * 32; number < num_points; number++) {
196
2/2
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 30 times.
62 if (*aPtr++ >= 0) {
197 32 *cPtr++ = 1;
198 } else {
199 30 *cPtr++ = 0;
200 }
201 }
202 2 }
203 #endif
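
Note: the net effect of the compare / shift / pack / permute / shuffle
sequence above is simply one 0-or-1 output byte per input float, in the
original order; the permute and byte shuffle only undo the per-128-bit-lane
interleaving introduced by the two saturating pack steps. For reference, a
scalar model of one 32-float block (a sketch, not part of the kernel):

    #include <stdint.h>

    /* Scalar model of one AVX2 iteration: 32 floats in, 32 ordered
     * 0/1 bytes out -- exactly what the SIMD block above computes. */
    static void slicer_block32_model(int8_t* out, const float* in)
    {
        for (int i = 0; i < 32; i++) {
            out[i] = (in[i] >= 0.0f) ? 1 : 0;
        }
    }
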
204
205 #ifdef LV_HAVE_AVX2
206 #include <immintrin.h>
207
208 2 static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
209 const float* aVector,
210 unsigned int num_points)
211 {
212 2 int8_t* cPtr = cVector;
213 2 const float* aPtr = aVector;
214 2 unsigned int number = 0;
215 2 unsigned int n32points = num_points / 32;
216
217 2 const __m256 zero_val = _mm256_set1_ps(0.0f);
218 __m256 a0_val, a1_val, a2_val, a3_val;
219 __m256 res0_f, res1_f, res2_f, res3_f;
220 __m256i res0_i, res1_i, res2_i, res3_i;
221 2 __m256i byte_shuffle = _mm256_set_epi8(15,
222 14,
223 13,
224 12,
225 7,
226 6,
227 5,
228 4,
229 11,
230 10,
231 9,
232 8,
233 3,
234 2,
235 1,
236 0,
237 15,
238 14,
239 13,
240 12,
241 7,
242 6,
243 5,
244 4,
245 11,
246 10,
247 9,
248 8,
249 3,
250 2,
251 1,
252 0);
253
254
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (number = 0; number < n32points; number++) {
255 8190 a0_val = _mm256_loadu_ps(aPtr);
256 8190 a1_val = _mm256_loadu_ps(aPtr + 8);
257 8190 a2_val = _mm256_loadu_ps(aPtr + 16);
258 8190 a3_val = _mm256_loadu_ps(aPtr + 24);
259
260 // compare >= 0; return float
261 8190 res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
262 8190 res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
263 8190 res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
264 8190 res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
265
266 // convert to 32i and >> 31
267 16380 res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
268 16380 res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
269 16380 res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
270 16380 res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
271
272 // pack into 16-bit results
273 8190 res0_i = _mm256_packs_epi32(res0_i, res1_i);
274 8190 res2_i = _mm256_packs_epi32(res2_i, res3_i);
275 // pack into 8-bit results
276 // res0: (after packs_epi32)
277 // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
278 // res2:
279 // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
280 8190 res0_i = _mm256_packs_epi16(res0_i, res2_i);
281 // shuffle the lanes
282 // res0: (after packs_epi16)
283 // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
284 // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
285 // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
286 8190 res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
287
288 // shuffle bytes within lanes
290 // res0: (after permute4x64_epi64)
290 // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
291 // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
292 8190 res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
293
294 _mm256_storeu_si256((__m256i*)cPtr, res0_i);
295 8190 aPtr += 32;
296 8190 cPtr += 32;
297 }
298
299
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (number = n32points * 32; number < num_points; number++) {
300
2/2
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 30 times.
62 if (*aPtr++ >= 0) {
301 32 *cPtr++ = 1;
302 } else {
303 30 *cPtr++ = 0;
304 }
305 }
306 2 }
307 #endif
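
Note: the _u_ (unaligned) variant above differs from the _a_ (aligned) one
only in its load/store intrinsics (_mm256_loadu_ps / _mm256_storeu_si256
instead of the aligned forms); the VOLK dispatcher normally picks between
them based on buffer alignment. A hand-rolled selection could look like this
sketch (ptr_is_aligned32 is a hypothetical helper, not a VOLK API):

    #include <stdint.h>

    /* Hypothetical helper: true when p meets the 32-byte alignment
     * required by the _a_avx2 kernel's aligned loads and stores. */
    static int ptr_is_aligned32(const void* p)
    {
        return ((uintptr_t)p & 31u) == 0;
    }

    /* usage sketch:
     *   if (ptr_is_aligned32(in) && ptr_is_aligned32(out))
     *       volk_32f_binary_slicer_8i_a_avx2(out, in, num_points);
     *   else
     *       volk_32f_binary_slicer_8i_u_avx2(out, in, num_points);
     */
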
308
309
310 #ifdef LV_HAVE_SSE2
311
312 #include <emmintrin.h>
313
314 2 static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
315 const float* aVector,
316 unsigned int num_points)
317 {
318 2 int8_t* cPtr = cVector;
319 2 const float* aPtr = aVector;
320 2 unsigned int number = 0;
321
322 2 unsigned int n16points = num_points / 16;
323 __m128 a0_val, a1_val, a2_val, a3_val;
324 __m128 res0_f, res1_f, res2_f, res3_f;
325 __m128i res0_i, res1_i, res2_i, res3_i;
326 __m128 zero_val;
327 2 zero_val = _mm_set1_ps(0.0f);
328
329
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < n16points; number++) {
330 16382 a0_val = _mm_load_ps(aPtr);
331 16382 a1_val = _mm_load_ps(aPtr + 4);
332 16382 a2_val = _mm_load_ps(aPtr + 8);
333 32764 a3_val = _mm_load_ps(aPtr + 12);
334
335 // compare >= 0; return float
336 16382 res0_f = _mm_cmpge_ps(a0_val, zero_val);
337 16382 res1_f = _mm_cmpge_ps(a1_val, zero_val);
338 16382 res2_f = _mm_cmpge_ps(a2_val, zero_val);
339 16382 res3_f = _mm_cmpge_ps(a3_val, zero_val);
340
341 // convert to 32i and >> 31
342 32764 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
343 32764 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
344 32764 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
345 32764 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
346
347 // pack into 16-bit results
348 16382 res0_i = _mm_packs_epi32(res0_i, res1_i);
349 16382 res2_i = _mm_packs_epi32(res2_i, res3_i);
350
351 // pack into 8-bit results
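// (within a single 128-bit register the two pack steps already leave
// the 16 results in order, so unlike the AVX2 version no permute or
// byte shuffle is needed here)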
352 16382 res0_i = _mm_packs_epi16(res0_i, res2_i);
353
354 _mm_store_si128((__m128i*)cPtr, res0_i);
355
356 16382 cPtr += 16;
357 16382 aPtr += 16;
358 }
359
360
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (number = n16points * 16; number < num_points; number++) {
361
2/2
✓ Branch 0 taken 19 times.
✓ Branch 1 taken 11 times.
30 if (*aPtr++ >= 0) {
362 19 *cPtr++ = 1;
363 } else {
364 11 *cPtr++ = 0;
365 }
366 }
367 2 }
368 #endif /* LV_HAVE_SSE2 */
369
370
371 #ifdef LV_HAVE_SSE2
372 #include <emmintrin.h>
373
374 2 static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
375 const float* aVector,
376 unsigned int num_points)
377 {
378 2 int8_t* cPtr = cVector;
379 2 const float* aPtr = aVector;
380 2 unsigned int number = 0;
381
382 2 unsigned int n16points = num_points / 16;
383 __m128 a0_val, a1_val, a2_val, a3_val;
384 __m128 res0_f, res1_f, res2_f, res3_f;
385 __m128i res0_i, res1_i, res2_i, res3_i;
386 __m128 zero_val;
387 2 zero_val = _mm_set1_ps(0.0f);
388
389
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < n16points; number++) {
390 16382 a0_val = _mm_loadu_ps(aPtr);
391 16382 a1_val = _mm_loadu_ps(aPtr + 4);
392 16382 a2_val = _mm_loadu_ps(aPtr + 8);
393 32764 a3_val = _mm_loadu_ps(aPtr + 12);
394
395 // compare >= 0; return float
396 16382 res0_f = _mm_cmpge_ps(a0_val, zero_val);
397 16382 res1_f = _mm_cmpge_ps(a1_val, zero_val);
398 16382 res2_f = _mm_cmpge_ps(a2_val, zero_val);
399 16382 res3_f = _mm_cmpge_ps(a3_val, zero_val);
400
401 // convert to 32i and >> 31
402 32764 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
403 32764 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
404 32764 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
405 32764 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
406
407 // pack into 16-bit results
408 16382 res0_i = _mm_packs_epi32(res0_i, res1_i);
409 16382 res2_i = _mm_packs_epi32(res2_i, res3_i);
410
411 // pack into 8-bit results
412 16382 res0_i = _mm_packs_epi16(res0_i, res2_i);
413
414 _mm_storeu_si128((__m128i*)cPtr, res0_i);
415
416 16382 cPtr += 16;
417 16382 aPtr += 16;
418 }
419
420
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (number = n16points * 16; number < num_points; number++) {
421
2/2
✓ Branch 0 taken 19 times.
✓ Branch 1 taken 11 times.
30 if (*aPtr++ >= 0) {
422 19 *cPtr++ = 1;
423 } else {
424 11 *cPtr++ = 0;
425 }
426 }
427 2 }
428 #endif /* LV_HAVE_SSE2 */
429
430
431 #ifdef LV_HAVE_NEON
432 #include <arm_neon.h>
433
434 static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
435 const float* aVector,
436 unsigned int num_points)
437 {
438 int8_t* cPtr = cVector;
439 const float* aPtr = aVector;
440 unsigned int number = 0;
441 unsigned int n16points = num_points / 16;
442
443 float32x4x2_t input_val0, input_val1;
444 float32x4_t zero_val;
445 uint32x4x2_t res0_u32, res1_u32;
446 uint16x4x2_t res0_u16x4, res1_u16x4;
447 uint16x8x2_t res_u16x8;
448 uint8x8x2_t res_u8;
449 uint8x8_t one;
450
451 zero_val = vdupq_n_f32(0.0);
452 one = vdup_n_u8(0x01);
453
454 // TODO: this is a good candidate for asm because the vcombines
455 // can be eliminated simply by picking dst registers that are
456 // adjacent.
457 for (number = 0; number < n16points; number++) {
458 input_val0 = vld2q_f32(aPtr);
459 input_val1 = vld2q_f32(aPtr + 8);
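// vld2q_f32 de-interleaves: even-indexed elements land in val[0] and
// odd-indexed elements in val[1]; the matching vst2_u8 below
// re-interleaves them, so the 16 output bytes come out in the
// original element order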
460
461 // test against 0; return uint32
462 res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
463 res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
464 res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
465 res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
466
467 // narrow uint32 -> uint16 followed by combine to 8-element vectors
468 res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
469 res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
470 res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
471 res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
472
473 res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
474 res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
475
476 // narrow uint16x8 -> uint8x8
477 res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
478 res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
479 // We *could* load twice as much data and do another vcombine here
480 // to get a uint8x16x2 vector, still doing only 2 vandqs and a single
481 // store, but that turns out to be ~16% slower than this version on the
482 // zc702. It's possible that register contention in the GCC scheduler slows
483 // it down; hand-written asm with quad-word u8 registers is much faster.
484
485 res_u8.val[0] = vand_u8(one, res_u8.val[0]);
486 res_u8.val[1] = vand_u8(one, res_u8.val[1]);
487
488 vst2_u8((unsigned char*)cPtr, res_u8);
489 cPtr += 16;
490 aPtr += 16;
491 }
492
493 for (number = n16points * 16; number < num_points; number++) {
494 if (*aPtr++ >= 0) {
495 *cPtr++ = 1;
496 } else {
497 *cPtr++ = 0;
498 }
499 }
500 }
501 #endif /* LV_HAVE_NEON */
502
503
504 #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
505