GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32u_byteswap.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 95 104 91.3%
Functions: 5 6 83.3%
Branches: 18 20 90.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32u_byteswap
12 *
13 * \b Overview
14 *
15 * Byteswaps (in-place) an aligned vector of uint32_t's.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32u_byteswap(uint32_t* intsToSwap, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li intsToSwap: The vector of data to byte swap.
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li intsToSwap: returns as an in-place calculation.
28 *
29 * \b Example
30 * \code
31 * int N = 10;
32 * unsigned int alignment = volk_get_alignment();
33 *
34 * uint32_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffff,
35 * 0x5a5a5a5a, 0xa5a5a5a5, 0x2a2a2a2a,
36 * 0xffffffff, 0x32, 0x64};
37 * uint32_t hamming_distance = 0;
38 *
39 * printf("byteswap vector =\n");
40 * for(unsigned int ii=0; ii<N; ++ii){
41 * printf(" %.8x\n", bitstring[ii]);
42 * }
43 *
44 * volk_32u_byteswap(bitstring, N);
45 *
46 * printf("byteswapped vector =\n");
47 * for(unsigned int ii=0; ii<N; ++ii){
48 * printf(" %.8x\n", bitstring[ii]);
49 * }
50 * \endcode
51 */
52
53 #ifndef INCLUDED_volk_32u_byteswap_u_H
54 #define INCLUDED_volk_32u_byteswap_u_H
55
56 #include <inttypes.h>
57 #include <stdio.h>
58
59 #if LV_HAVE_AVX2
60 #include <immintrin.h>
61 2 static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
62 {
63 // Reverses the byte order of num_points 32-bit words in place, using unaligned AVX2 loads/stores.
64 unsigned int number;
65
66 2 const unsigned int nPerSet = 8; // 32-bit words handled per loop iteration (one 256-bit register)
67 2 const uint64_t nSets = num_points / nPerSet;
68
69 2 uint32_t* inputPtr = intsToSwap;
70 // Each group of four indices is reversed (3,2,1,0 / 7,6,5,4 / ...), mirroring the bytes of one word.
71 2 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
72 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
73 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
74
75 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
76 // _mm256_shuffle_epi8 works per 128-bit lane on the low 4 index bits, so 16..31 act as 0..15 in the upper lane.
77
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < nSets; number++) {
78
79 // Load eight 32-bit values; inputPtr is advanced later since we're doing it in-place.
80 32766 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
81 32766 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
82
83 // Store the results
84 _mm256_storeu_si256((__m256i*)inputPtr, output);
85 32766 inputPtr += nPerSet;
86 }
87
88 // Byteswap any remaining points (fewer than nPerSet) one word at a time:
89
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = nSets * nPerSet; number < num_points; number++) {
90 14 uint32_t outputVal = *inputPtr;
91 14 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
92 14 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
93 14 *inputPtr = outputVal;
94 14 inputPtr++;
95 }
96 2 }
97 #endif /* LV_HAVE_AVX2 */
98
99
100 #ifdef LV_HAVE_SSE2
101 #include <emmintrin.h>
102
103 2 static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
104 {
105 2 unsigned int number = 0;
106 // Reverses the byte order of num_points 32-bit words in place, using unaligned SSE2 loads/stores.
107 2 uint32_t* inputPtr = intsToSwap;
108 __m128i input, byte1, byte2, byte3, byte4, output;
109 2 __m128i byte2mask = _mm_set1_epi32(0x00FF0000); // keeps bits 16..23 of every 32-bit lane
110 2 __m128i byte3mask = _mm_set1_epi32(0x0000FF00); // keeps bits 8..15 of every 32-bit lane
111
112 2 const uint64_t quarterPoints = num_points / 4; // number of full 4-word (128-bit) groups
113
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
114 // Load four 32-bit values; inputPtr is advanced later since we're doing it in-place.
115 65534 input = _mm_loadu_si128((__m128i*)inputPtr);
116 // Do the four shifts (the 24-bit shifts leave a single byte, so byte1/byte4 need no mask)
117 65534 byte1 = _mm_slli_epi32(input, 24);
118 65534 byte2 = _mm_slli_epi32(input, 8);
119 65534 byte3 = _mm_srli_epi32(input, 8);
120 65534 byte4 = _mm_srli_epi32(input, 24);
121 // Or bytes together, masking the 8-bit shifts down to the one byte each contributes
122 65534 output = _mm_or_si128(byte1, byte4);
123 65534 byte2 = _mm_and_si128(byte2, byte2mask);
124 65534 output = _mm_or_si128(output, byte2);
125 65534 byte3 = _mm_and_si128(byte3, byte3mask);
126 65534 output = _mm_or_si128(output, byte3);
127 // Store the results
128 _mm_storeu_si128((__m128i*)inputPtr, output);
129 65534 inputPtr += 4;
130 }
131
132 // Byteswap any remaining points (fewer than four) one word at a time:
133 2 number = quarterPoints * 4;
134
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
135 6 uint32_t outputVal = *inputPtr;
136 6 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
137 6 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
138 6 *inputPtr = outputVal;
139 6 inputPtr++;
140 }
141 2 }
142 #endif /* LV_HAVE_SSE2 */
143
144
145 #ifdef LV_HAVE_NEON
146 #include <arm_neon.h>
147
148 static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
149 {
150 uint32_t* inputPtr = intsToSwap;
151 unsigned int number = 0;
152 unsigned int n8points = num_points / 8; // number of full 8-word (32-byte) groups
153 // Reverses the byte order of num_points 32-bit words in place, eight words per table-lookup pass.
154 uint8x8x4_t input_table;
155 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
156 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
157
158 /* These magic numbers are used as byte-indices in the LUT.
159 They are pre-computed to save time. The LUT is filled by vld4_u8, whose
160 de-interleaving load puts byte k of word w at index 8*k + w, so word 0
161 reversed is {24, 16, 8, 0} and word 1 reversed is {25, 17, 9, 1}.
162 Each constant packs eight such indices, lowest byte first; e.g. for lookup01:
163 uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
164 for(ii=0; ii < 8; ++ii) { index += ((uint64_t)(*(chars+ii))) << (ii*8); }
165 */
166 int_lookup01 = vcreate_u8(74609667900706840);
167 int_lookup23 = vcreate_u8(219290013576860186);
168 int_lookup45 = vcreate_u8(363970359253013532);
169 int_lookup67 = vcreate_u8(508650704929166878);
170 // Per iteration: one de-interleaving 32-byte load, four lookups (two words each), four 8-byte stores.
171 for (number = 0; number < n8points; ++number) {
172 input_table = vld4_u8((uint8_t*)inputPtr);
173 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
174 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
175 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
176 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
177 vst1_u8((uint8_t*)inputPtr, swapped_int01);
178 vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
179 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
180 vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
181
182 inputPtr += 8;
183 }
184 // Byteswap any remaining points (fewer than eight) one word at a time.
185 for (number = n8points * 8; number < num_points; ++number) {
186 uint32_t output = *inputPtr;
187 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
188 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
189
190 *inputPtr = output;
191 inputPtr++;
192 }
193 }
194 #endif /* LV_HAVE_NEON */
195
196 #ifdef LV_HAVE_NEONV8
197 #include <arm_neon.h>
198
199 static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
200 {
201 uint32_t* inputPtr = (uint32_t*)intsToSwap;
202 const unsigned int n8points = num_points / 8; // number of full 8-word groups
203 uint8x16_t input;
204 uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
205 // idx reverses every group of four byte positions, so one vqtbl1q_u8 lookup byte-swaps four words.
206 unsigned int number = 0;
207 for (number = 0; number < n8points; ++number) {
208 __VOLK_PREFETCH(inputPtr + 8); // NOTE(review): macro not defined in this file; presumably a prefetch hint for the next group - confirm
209 input = vld1q_u8((uint8_t*)inputPtr);
210 input = vqtbl1q_u8(input, idx);
211 vst1q_u8((uint8_t*)inputPtr, input);
212 inputPtr += 4;
213 // Second 16-byte half of the 8-word group.
214 input = vld1q_u8((uint8_t*)inputPtr);
215 input = vqtbl1q_u8(input, idx);
216 vst1q_u8((uint8_t*)inputPtr, input);
217 inputPtr += 4;
218 }
219 // Byteswap any remaining points (fewer than eight) one word at a time.
220 for (number = n8points * 8; number < num_points; ++number) {
221 uint32_t output = *inputPtr;
222
223 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
224 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
225
226 *inputPtr++ = output;
227 }
228 }
229 #endif /* LV_HAVE_NEONV8 */
230
231
232 #ifdef LV_HAVE_GENERIC
233
234 2 static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
235 unsigned int num_points)
236 {
237 2 uint32_t* inputPtr = intsToSwap;
238 // Portable scalar version: reverses the byte order of each of the num_points words in place.
239 unsigned int point;
240
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (point = 0; point < num_points; point++) {
241 262142 uint32_t output = *inputPtr;
242 262142 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
243 262142 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
244 // Each shifted copy is masked to the single byte it moves; the OR reassembles them reversed.
245 262142 *inputPtr = output;
246 262142 inputPtr++;
247 }
248 2 }
249 #endif /* LV_HAVE_GENERIC */
250
251
252 #endif /* INCLUDED_volk_32u_byteswap_u_H */
253 #ifndef INCLUDED_volk_32u_byteswap_a_H
254 #define INCLUDED_volk_32u_byteswap_a_H
255
256 #include <inttypes.h>
257 #include <stdio.h>
258
259
260 #if LV_HAVE_AVX2
261 #include <immintrin.h>
262 2 static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
263 {
264 // Reverses the byte order of num_points 32-bit words in place; the aligned load/store used below needs a 32-byte aligned intsToSwap.
265 unsigned int number;
266
267 2 const unsigned int nPerSet = 8; // 32-bit words handled per loop iteration (one 256-bit register)
268 2 const uint64_t nSets = num_points / nPerSet;
269
270 2 uint32_t* inputPtr = intsToSwap;
271 // Each group of four indices is reversed (3,2,1,0 / 7,6,5,4 / ...), mirroring the bytes of one word.
272 2 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
273 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
274 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
275 // The table is loaded unaligned even here: shuffleVector is a local array with no alignment attribute.
276 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
277 // _mm256_shuffle_epi8 works per 128-bit lane on the low 4 index bits, so 16..31 act as 0..15 in the upper lane.
278
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < nSets; number++) {
279
280 // Load eight 32-bit values; inputPtr is advanced later since we're doing it in-place.
281 32766 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
282 32766 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
283
284 // Store the results
285 _mm256_store_si256((__m256i*)inputPtr, output);
286 32766 inputPtr += nPerSet;
287 }
288
289 // Byteswap any remaining points (fewer than nPerSet) one word at a time:
290
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = nSets * nPerSet; number < num_points; number++) {
291 14 uint32_t outputVal = *inputPtr;
292 14 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
293 14 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
294 14 *inputPtr = outputVal;
295 14 inputPtr++;
296 }
297 2 }
298 #endif /* LV_HAVE_AVX2 */
299
300
301 #ifdef LV_HAVE_SSE2
302 #include <emmintrin.h>
303
304
305 2 static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
306 {
307 2 unsigned int number = 0;
308 // Reverses the byte order of num_points 32-bit words in place; the aligned load/store used below needs a 16-byte aligned intsToSwap.
309 2 uint32_t* inputPtr = intsToSwap;
310 __m128i input, byte1, byte2, byte3, byte4, output;
311 2 __m128i byte2mask = _mm_set1_epi32(0x00FF0000); // keeps bits 16..23 of every 32-bit lane
312 2 __m128i byte3mask = _mm_set1_epi32(0x0000FF00); // keeps bits 8..15 of every 32-bit lane
313
314 2 const uint64_t quarterPoints = num_points / 4; // number of full 4-word (128-bit) groups
315
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
316 // Load four 32-bit values; inputPtr is advanced later since we're doing it in-place.
317 65534 input = _mm_load_si128((__m128i*)inputPtr);
318 // Do the four shifts (the 24-bit shifts leave a single byte, so byte1/byte4 need no mask)
319 65534 byte1 = _mm_slli_epi32(input, 24);
320 65534 byte2 = _mm_slli_epi32(input, 8);
321 65534 byte3 = _mm_srli_epi32(input, 8);
322 65534 byte4 = _mm_srli_epi32(input, 24);
323 // Or bytes together, masking the 8-bit shifts down to the one byte each contributes
324 65534 output = _mm_or_si128(byte1, byte4);
325 65534 byte2 = _mm_and_si128(byte2, byte2mask);
326 65534 output = _mm_or_si128(output, byte2);
327 65534 byte3 = _mm_and_si128(byte3, byte3mask);
328 65534 output = _mm_or_si128(output, byte3);
329 // Store the results
330 _mm_store_si128((__m128i*)inputPtr, output);
331 65534 inputPtr += 4;
332 }
333
334 // Byteswap any remaining points (fewer than four) one word at a time:
335 2 number = quarterPoints * 4;
336
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
337 6 uint32_t outputVal = *inputPtr;
338 6 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
339 6 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
340 6 *inputPtr = outputVal;
341 6 inputPtr++;
342 }
343 2 }
344 #endif /* LV_HAVE_SSE2 */
345
346
347 #ifdef LV_HAVE_GENERIC
348
349 static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
350 unsigned int num_points)
351 {
352 uint32_t* inputPtr = intsToSwap;
353 // Portable scalar version: reverses the byte order of each of the num_points words in place.
354 unsigned int point;
355 for (point = 0; point < num_points; point++) {
356 uint32_t output = *inputPtr;
357 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
358 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
359 // Each shifted copy is masked to the single byte it moves; the OR reassembles them reversed.
360 *inputPtr = output;
361 inputPtr++;
362 }
363 }
364 #endif /* LV_HAVE_GENERIC */
365
366
367 #endif /* INCLUDED_volk_32u_byteswap_a_H */
368