GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_64u_byteswap.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 164 176 93.2%
Functions: 7 8 87.5%
Branches: 26 28 92.9%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_64u_byteswap
12 *
13 * \b Overview
14 *
15 * Byteswaps (in-place) an aligned vector of int64_t's.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_64u_byteswap(uint64_t* intsToSwap, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li intsToSwap: The vector of data to byte swap
24 * \li num_points: The number of data points
25 *
26 * \b Outputs
27 * \li intsToSwap: returns as an in-place calculation.
28 *
29 * \b Example
30 * \code
31 * int N = 10;
32 * unsigned int alignment = volk_get_alignment();
33 *
34 * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff,
35 * 0x5a5a5a5a5a5a5a5a, 0xa5a5a5a5a5a5a5a5, 0x2a2a2a2a2a2a2a2a,
36 * 0xffffffff, 0x32, 0x64};
37 * uint64_t hamming_distance = 0;
38 *
39 * printf("byteswap vector =\n");
40 * for(unsigned int ii=0; ii<N; ++ii){
41 * printf(" %.16lx\n", bitstring[ii]);
42 * }
43 *
44 * volk_64u_byteswap(bitstring, N);
45 *
46 * printf("byteswapped vector =\n");
47 * for(unsigned int ii=0; ii<N; ++ii){
48 * printf(" %.16lx\n", bitstring[ii]);
49 * }
50 * \endcode
51 */
52
53 #ifndef INCLUDED_volk_64u_byteswap_u_H
54 #define INCLUDED_volk_64u_byteswap_u_H
55
56 #include <inttypes.h>
57 #include <stdio.h>
58
59 #ifdef LV_HAVE_SSE2
60 #include <emmintrin.h>
61
62 2 static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
63 {
64 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
65 __m128i input, byte1, byte2, byte3, byte4, output;
66 2 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
67 2 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
68 2 uint64_t number = 0;
69 2 const unsigned int halfPoints = num_points / 2;
70
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
71 // Load the 32t values, increment inputPtr later since we're doing it in-place.
72 131070 input = _mm_loadu_si128((__m128i*)inputPtr);
73
74 // Do the four shifts
75 131070 byte1 = _mm_slli_epi32(input, 24);
76 131070 byte2 = _mm_slli_epi32(input, 8);
77 131070 byte3 = _mm_srli_epi32(input, 8);
78 131070 byte4 = _mm_srli_epi32(input, 24);
79 // Or bytes together
80 131070 output = _mm_or_si128(byte1, byte4);
81 131070 byte2 = _mm_and_si128(byte2, byte2mask);
82 131070 output = _mm_or_si128(output, byte2);
83 131070 byte3 = _mm_and_si128(byte3, byte3mask);
84 131070 output = _mm_or_si128(output, byte3);
85
86 // Reorder the two words
87 131070 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
88
89 // Store the results
90 _mm_storeu_si128((__m128i*)inputPtr, output);
91 131070 inputPtr += 4;
92 }
93
94 // Byteswap any remaining points:
95 2 number = halfPoints * 2;
96
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
97 2 uint32_t output1 = *inputPtr;
98 2 uint32_t output2 = inputPtr[1];
99
100 2 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
101 2 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
102
103 2 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
104 2 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
105
106 2 *inputPtr++ = output2;
107 2 *inputPtr++ = output1;
108 }
109 2 }
110 #endif /* LV_HAVE_SSE2 */
111
112
113 #ifdef LV_HAVE_GENERIC
114
115 2 static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
116 unsigned int num_points)
117 {
118 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
119 unsigned int point;
120
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (point = 0; point < num_points; point++) {
121 262142 uint32_t output1 = *inputPtr;
122 262142 uint32_t output2 = inputPtr[1];
123
124 262142 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
125 262142 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
126
127 262142 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
128 262142 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
129
130 262142 *inputPtr++ = output2;
131 262142 *inputPtr++ = output1;
132 }
133 2 }
134 #endif /* LV_HAVE_GENERIC */
135
136 #if LV_HAVE_AVX2
137 #include <immintrin.h>
138 2 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
139 {
140 2 unsigned int number = 0;
141
142 2 const unsigned int nPerSet = 4;
143 2 const uint64_t nSets = num_points / nPerSet;
144
145 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
146
147 2 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
148 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
149 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
150
151 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
152
153
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < nSets; number++) {
154
155 // Load the 32t values, increment inputPtr later since we're doing it in-place.
156 65534 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
157 65534 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
158
159 // Store the results
160 _mm256_store_si256((__m256i*)inputPtr, output);
161
162 /* inputPtr is 32bit so increment twice */
163 65534 inputPtr += 2 * nPerSet;
164 }
165
166 // Byteswap any remaining points:
167
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (number = nSets * nPerSet; number < num_points; ++number) {
168 6 uint32_t output1 = *inputPtr;
169 6 uint32_t output2 = inputPtr[1];
170 6 uint32_t out1 =
171 6 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
172 6 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
173
174 6 uint32_t out2 =
175 6 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
176 6 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
177 6 *inputPtr++ = out2;
178 6 *inputPtr++ = out1;
179 }
180 2 }
181
182 #endif /* LV_HAVE_AVX2 */
183
184
185 #if LV_HAVE_SSSE3
186 #include <tmmintrin.h>
187 2 static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
188 unsigned int num_points)
189 {
190 2 unsigned int number = 0;
191
192 2 const unsigned int nPerSet = 2;
193 2 const uint64_t nSets = num_points / nPerSet;
194
195 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
196
197 2 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
198
199 2 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
200
201
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < nSets; number++) {
202
203 // Load the 32t values, increment inputPtr later since we're doing it in-place.
204 131070 const __m128i input = _mm_load_si128((__m128i*)inputPtr);
205 131070 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
206
207 // Store the results
208 _mm_store_si128((__m128i*)inputPtr, output);
209
210 /* inputPtr is 32bit so increment twice */
211 131070 inputPtr += 2 * nPerSet;
212 }
213
214 // Byteswap any remaining points:
215
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (number = nSets * nPerSet; number < num_points; ++number) {
216 2 uint32_t output1 = *inputPtr;
217 2 uint32_t output2 = inputPtr[1];
218 2 uint32_t out1 =
219 2 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
220 2 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
221
222 2 uint32_t out2 =
223 2 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
224 2 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
225 2 *inputPtr++ = out2;
226 2 *inputPtr++ = out1;
227 }
228 2 }
229 #endif /* LV_HAVE_SSSE3 */
230
231
232 #ifdef LV_HAVE_NEONV8
233 #include <arm_neon.h>
234
235 static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
236 {
237 uint32_t* inputPtr = (uint32_t*)intsToSwap;
238 const unsigned int n4points = num_points / 4;
239 uint8x16x2_t input;
240 uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
241
242 unsigned int number = 0;
243 for (number = 0; number < n4points; ++number) {
244 __VOLK_PREFETCH(inputPtr + 8);
245 input = vld2q_u8((uint8_t*)inputPtr);
246 input.val[0] = vqtbl1q_u8(input.val[0], idx);
247 input.val[1] = vqtbl1q_u8(input.val[1], idx);
248 vst2q_u8((uint8_t*)inputPtr, input);
249
250 inputPtr += 8;
251 }
252
253 for (number = n4points * 4; number < num_points; ++number) {
254 uint32_t output1 = *inputPtr;
255 uint32_t output2 = inputPtr[1];
256
257 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
258 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
259 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
260 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
261
262 *inputPtr++ = output2;
263 *inputPtr++ = output1;
264 }
265 }
266 #else
267 #ifdef LV_HAVE_NEON
268 #include <arm_neon.h>
269
270 static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
271 {
272 uint32_t* inputPtr = (uint32_t*)intsToSwap;
273 unsigned int number = 0;
274 unsigned int n8points = num_points / 4;
275
276 uint8x8x4_t input_table;
277 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
278 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
279
280 /* these magic numbers are used as byte-indices in the LUT.
281 they are pre-computed to save time. A simple C program
282 can calculate them; for example for lookup01:
283 uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
284 for(ii=0; ii < 8; ++ii) {
285 index += ((uint64_t)(*(chars+ii))) << (ii*8);
286 }
287 */
288 int_lookup01 = vcreate_u8(2269495096316185);
289 int_lookup23 = vcreate_u8(146949840772469531);
290 int_lookup45 = vcreate_u8(291630186448622877);
291 int_lookup67 = vcreate_u8(436310532124776223);
292
293 for (number = 0; number < n8points; ++number) {
294 input_table = vld4_u8((uint8_t*)inputPtr);
295 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
296 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
297 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
298 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
299 vst1_u8((uint8_t*)inputPtr, swapped_int01);
300 vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
301 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
302 vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
303
304 inputPtr += 4;
305 }
306
307 for (number = n8points * 4; number < num_points; ++number) {
308 uint32_t output1 = *inputPtr;
309 uint32_t output2 = inputPtr[1];
310
311 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
312 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
313 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
314 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
315
316 *inputPtr++ = output2;
317 *inputPtr++ = output1;
318 }
319 }
320 #endif /* LV_HAVE_NEON */
321 #endif
322
323 #endif /* INCLUDED_volk_64u_byteswap_u_H */
324 #ifndef INCLUDED_volk_64u_byteswap_a_H
325 #define INCLUDED_volk_64u_byteswap_a_H
326
327 #include <inttypes.h>
328 #include <stdio.h>
329
330
331 #ifdef LV_HAVE_SSE2
332 #include <emmintrin.h>
333
334 2 static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
335 {
336 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
337 __m128i input, byte1, byte2, byte3, byte4, output;
338 2 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
339 2 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
340 2 uint64_t number = 0;
341 2 const unsigned int halfPoints = num_points / 2;
342
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
343 // Load the 32t values, increment inputPtr later since we're doing it in-place.
344 131070 input = _mm_load_si128((__m128i*)inputPtr);
345
346 // Do the four shifts
347 131070 byte1 = _mm_slli_epi32(input, 24);
348 131070 byte2 = _mm_slli_epi32(input, 8);
349 131070 byte3 = _mm_srli_epi32(input, 8);
350 131070 byte4 = _mm_srli_epi32(input, 24);
351 // Or bytes together
352 131070 output = _mm_or_si128(byte1, byte4);
353 131070 byte2 = _mm_and_si128(byte2, byte2mask);
354 131070 output = _mm_or_si128(output, byte2);
355 131070 byte3 = _mm_and_si128(byte3, byte3mask);
356 131070 output = _mm_or_si128(output, byte3);
357
358 // Reorder the two words
359 131070 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
360
361 // Store the results
362 _mm_store_si128((__m128i*)inputPtr, output);
363 131070 inputPtr += 4;
364 }
365
366 // Byteswap any remaining points:
367 2 number = halfPoints * 2;
368
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
369 2 uint32_t output1 = *inputPtr;
370 2 uint32_t output2 = inputPtr[1];
371
372 2 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
373 2 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
374
375 2 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
376 2 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
377
378 2 *inputPtr++ = output2;
379 2 *inputPtr++ = output1;
380 }
381 2 }
382 #endif /* LV_HAVE_SSE2 */
383
384 #if LV_HAVE_AVX2
385 #include <immintrin.h>
386 2 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
387 {
388 2 unsigned int number = 0;
389
390 2 const unsigned int nPerSet = 4;
391 2 const uint64_t nSets = num_points / nPerSet;
392
393 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
394
395 2 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
396 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
397 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
398
399 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
400
401
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < nSets; number++) {
402 // Load the 32t values, increment inputPtr later since we're doing it in-place.
403 65534 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
404 65534 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
405
406 // Store the results
407 _mm256_storeu_si256((__m256i*)inputPtr, output);
408
409 /* inputPtr is 32bit so increment twice */
410 65534 inputPtr += 2 * nPerSet;
411 }
412
413 // Byteswap any remaining points:
414
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (number = nSets * nPerSet; number < num_points; ++number) {
415 6 uint32_t output1 = *inputPtr;
416 6 uint32_t output2 = inputPtr[1];
417 6 uint32_t out1 =
418 6 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
419 6 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
420
421 6 uint32_t out2 =
422 6 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
423 6 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
424 6 *inputPtr++ = out2;
425 6 *inputPtr++ = out1;
426 }
427 2 }
428
429 #endif /* LV_HAVE_AVX2 */
430
431
432 #if LV_HAVE_SSSE3
433 #include <tmmintrin.h>
434 2 static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
435 unsigned int num_points)
436 {
437 2 unsigned int number = 0;
438
439 2 const unsigned int nPerSet = 2;
440 2 const uint64_t nSets = num_points / nPerSet;
441
442 2 uint32_t* inputPtr = (uint32_t*)intsToSwap;
443
444 2 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
445
446 2 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
447
448
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < nSets; number++) {
449 // Load the 32t values, increment inputPtr later since we're doing it in-place.
450 131070 const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
451 131070 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
452
453 // Store the results
454 _mm_storeu_si128((__m128i*)inputPtr, output);
455
456 /* inputPtr is 32bit so increment twice */
457 131070 inputPtr += 2 * nPerSet;
458 }
459
460 // Byteswap any remaining points:
461
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (number = nSets * nPerSet; number < num_points; ++number) {
462 2 uint32_t output1 = *inputPtr;
463 2 uint32_t output2 = inputPtr[1];
464 2 uint32_t out1 =
465 2 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
466 2 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
467
468 2 uint32_t out2 =
469 2 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
470 2 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
471 2 *inputPtr++ = out2;
472 2 *inputPtr++ = out1;
473 }
474 2 }
475 #endif /* LV_HAVE_SSSE3 */
476
477 #ifdef LV_HAVE_GENERIC
478
479 static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
480 unsigned int num_points)
481 {
482 uint32_t* inputPtr = (uint32_t*)intsToSwap;
483 unsigned int point;
484 for (point = 0; point < num_points; point++) {
485 uint32_t output1 = *inputPtr;
486 uint32_t output2 = inputPtr[1];
487
488 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
489 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
490
491 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
492 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
493
494 *inputPtr++ = output2;
495 *inputPtr++ = output1;
496 }
497 }
498 #endif /* LV_HAVE_GENERIC */
499
500
501 #endif /* INCLUDED_volk_64u_byteswap_a_H */
502