Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_64u_byteswap | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Byteswaps (in-place) an aligned vector of uint64_t's. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_64u_byteswap(uint64_t* intsToSwap, unsigned int num_points) | ||
20 | * \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li intsToSwap: The vector of data to byte swap | ||
24 | * \li num_points: The number of data points | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li intsToSwap: returns as an in-place calculation. | ||
28 | * | ||
29 | * \b Example | ||
30 | * \code | ||
31 | * int N = 10; | ||
32 | * unsigned int alignment = volk_get_alignment(); | ||
33 | * | ||
34 | * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff, | ||
35 | * 0x5a5a5a5a5a5a5a5a, 0xa5a5a5a5a5a5a5a5, 0x2a2a2a2a2a2a2a2a, | ||
36 | * 0xffffffff, 0x32, 0x64}; | ||
37 | * uint64_t hamming_distance = 0; | ||
38 | * | ||
39 | * printf("byteswap vector =\n"); | ||
40 | * for(unsigned int ii=0; ii<N; ++ii){ | ||
41 | * printf(" %.16lx\n", bitstring[ii]); | ||
42 | * } | ||
43 | * | ||
44 | * volk_64u_byteswap(bitstring, N); | ||
45 | * | ||
46 | * printf("byteswapped vector =\n"); | ||
47 | * for(unsigned int ii=0; ii<N; ++ii){ | ||
48 | * printf(" %.16lx\n", bitstring[ii]); | ||
49 | * } | ||
50 | * \endcode | ||
51 | */ | ||
52 | |||
53 | #ifndef INCLUDED_volk_64u_byteswap_u_H | ||
54 | #define INCLUDED_volk_64u_byteswap_u_H | ||
55 | |||
56 | #include <inttypes.h> | ||
57 | #include <stdio.h> | ||
58 | |||
59 | #ifdef LV_HAVE_SSE2 | ||
60 | #include <emmintrin.h> | ||
61 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using SSE2 with
 *        unaligned loads and stores.
 *
 * Two points are handled per 128-bit register: the bytes of every 32-bit lane
 * are reversed with shifts and masks, then the two lanes of each 64-bit value
 * are exchanged. A leftover point (odd num_points) is swapped with plain
 * 64-bit integer arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); it
 *        does not have to be 16-byte aligned
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    unsigned int number;

    for (number = 0; number < halfPoints; number++) {
        // Load two 64-bit points; the pointer advances after the in-place store.
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);

        // Move each byte of every 32-bit lane to its mirrored position.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);

        // Or the four byte positions back together.
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit words of each 64-bit value.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = halfPoints * 2; number < num_points; number++) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
110 | #endif /* LV_HAVE_SSE2 */ | ||
111 | |||
112 | |||
113 | #ifdef LV_HAVE_GENERIC | ||
114 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place (portable reference
 *        implementation).
 *
 * Every point is read and written as a uint64_t, so the buffer is never
 * accessed through a pointer of a different integer type. Reversing all eight
 * bytes at once gives the same result as byteswapping the two 32-bit halves
 * and exchanging them.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place)
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;

    for (point = 0; point < num_points; point++) {
        const uint64_t value = intsToSwap[point];
        intsToSwap[point] = ((value >> 56) & 0x00000000000000ffULL) |
                            ((value >> 40) & 0x000000000000ff00ULL) |
                            ((value >> 24) & 0x0000000000ff0000ULL) |
                            ((value >> 8) & 0x00000000ff000000ULL) |
                            ((value << 8) & 0x000000ff00000000ULL) |
                            ((value << 24) & 0x0000ff0000000000ULL) |
                            ((value << 40) & 0x00ff000000000000ULL) |
                            ((value << 56) & 0xff00000000000000ULL);
    }
}
134 | #endif /* LV_HAVE_GENERIC */ | ||
135 | |||
136 | #if LV_HAVE_AVX2 | ||
137 | #include <immintrin.h> | ||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using AVX2 with aligned
 *        loads and stores.
 *
 * Four points are handled per 256-bit register with a single byte shuffle.
 * Up to three leftover points are swapped with plain 64-bit integer
 * arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); the
 *        aligned load/store intrinsics used here require 32-byte alignment
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte indices that reverse each group of eight bytes. The byte shuffle
    // works per 128-bit lane and uses only the low four bits of each index.
    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        // Load four 64-bit points; the pointer advances after the in-place store.
        const __m256i input = _mm256_load_si256((const __m256i*)dataPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_store_si256((__m256i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap any remaining points. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
181 | |||
182 | #endif /* LV_HAVE_AVX2 */ | ||
183 | |||
184 | |||
185 | #if LV_HAVE_SSSE3 | ||
186 | #include <tmmintrin.h> | ||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using SSSE3 with aligned
 *        loads and stores.
 *
 * Two points are handled per 128-bit register with a single byte shuffle. A
 * leftover point (odd num_points) is swapped with plain 64-bit integer
 * arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); the
 *        aligned load/store intrinsics used here require 16-byte alignment
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    const unsigned int nPerSet = 2;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte indices that reverse each group of eight bytes.
    const uint8_t shuffleVector[16] = { 7,  6,  5,  4,  3,  2,  1, 0,
                                        15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        // Load two 64-bit points; the pointer advances after the in-place store.
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap any remaining point. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
229 | #endif /* LV_HAVE_SSSE3 */ | ||
230 | |||
231 | |||
232 | #ifdef LV_HAVE_NEONV8 | ||
233 | #include <arm_neon.h> | ||
234 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using ARMv8 NEON table
 *        lookups.
 *
 * Four points (32 bytes) are handled per iteration as two 16-byte registers.
 * The registers are loaded and stored contiguously: the index table below
 * reverses each group of eight *adjacent* bytes, so it must not be applied to
 * the de-interleaved even/odd byte streams that a two-register structure
 * load (vld2q_u8) would produce.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place)
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const unsigned int n4points = num_points / 4;
    // Byte indices that reverse each group of eight bytes in a 16-byte register.
    const uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    unsigned int number;

    for (number = 0; number < n4points; ++number) {
        uint8_t* bytePtr = (uint8_t*)dataPtr;
        // Prefetch the four points handled by the next iteration.
        __VOLK_PREFETCH(dataPtr + 4);

        const uint8x16_t lower = vld1q_u8(bytePtr);
        const uint8x16_t upper = vld1q_u8(bytePtr + 16);
        vst1q_u8(bytePtr, vqtbl1q_u8(lower, idx));
        vst1q_u8(bytePtr + 16, vqtbl1q_u8(upper, idx));

        dataPtr += 4;
    }

    // Byteswap any remaining points. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = n4points * 4; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
266 | #else | ||
267 | #ifdef LV_HAVE_NEON | ||
268 | #include <arm_neon.h> | ||
269 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using NEON table
 *        lookups.
 *
 * Four points (32 bytes) are handled per iteration: a four-way de-interleaving
 * load builds a 32-byte lookup table, four table lookups pick the bytes of
 * each point in reversed order, and the four results are stored back to back.
 * The data pointer therefore advances by four points per iteration.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place)
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const unsigned int n4points = num_points / 4;
    unsigned int number;

    /* These magic numbers are byte indices into the 32-byte table produced by
       vld4_u8, where table entry (8 * k + j) holds memory byte (4 * j + k).
       They are pre-computed to save time. A simple C program can calculate
       them; for example for lookup01 (first point, bytes 7..0):
          uint8_t chars[8] = {25, 17, 9, 1, 24, 16, 8, 0};
          for(ii=0; ii < 8; ++ii) {
              index += ((uint64_t)(*(chars+ii))) << (ii*8);
          }
       Each following constant adds 2 to every index, selecting the next point.
    */
    const uint8x8_t int_lookup01 = vcreate_u8(2269495096316185);
    const uint8x8_t int_lookup23 = vcreate_u8(146949840772469531);
    const uint8x8_t int_lookup45 = vcreate_u8(291630186448622877);
    const uint8x8_t int_lookup67 = vcreate_u8(436310532124776223);

    for (number = 0; number < n4points; ++number) {
        uint8_t* bytePtr = (uint8_t*)dataPtr;
        const uint8x8x4_t input_table = vld4_u8(bytePtr);

        vst1_u8(bytePtr, vtbl4_u8(input_table, int_lookup01));
        vst1_u8(bytePtr + 8, vtbl4_u8(input_table, int_lookup23));
        vst1_u8(bytePtr + 16, vtbl4_u8(input_table, int_lookup45));
        vst1_u8(bytePtr + 24, vtbl4_u8(input_table, int_lookup67));

        // All 32 bytes (four points) were consumed; step past every one of them.
        dataPtr += 4;
    }

    // Byteswap any remaining points. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = n4points * 4; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
320 | #endif /* LV_HAVE_NEON */ | ||
321 | #endif | ||
322 | |||
323 | #endif /* INCLUDED_volk_64u_byteswap_u_H */ | ||
324 | #ifndef INCLUDED_volk_64u_byteswap_a_H | ||
325 | #define INCLUDED_volk_64u_byteswap_a_H | ||
326 | |||
327 | #include <inttypes.h> | ||
328 | #include <stdio.h> | ||
329 | |||
330 | |||
331 | #ifdef LV_HAVE_SSE2 | ||
332 | #include <emmintrin.h> | ||
333 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using SSE2 with aligned
 *        loads and stores.
 *
 * Two points are handled per 128-bit register: the bytes of every 32-bit lane
 * are reversed with shifts and masks, then the two lanes of each 64-bit value
 * are exchanged. A leftover point (odd num_points) is swapped with plain
 * 64-bit integer arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); the
 *        aligned load/store intrinsics used here require 16-byte alignment
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    unsigned int number;

    for (number = 0; number < halfPoints; number++) {
        // Load two 64-bit points; the pointer advances after the in-place store.
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);

        // Move each byte of every 32-bit lane to its mirrored position.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);

        // Or the four byte positions back together.
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit words of each 64-bit value.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = halfPoints * 2; number < num_points; number++) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
382 | #endif /* LV_HAVE_SSE2 */ | ||
383 | |||
384 | #if LV_HAVE_AVX2 | ||
385 | #include <immintrin.h> | ||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using AVX2 with
 *        unaligned loads and stores.
 *
 * Four points are handled per 256-bit register with a single byte shuffle.
 * Up to three leftover points are swapped with plain 64-bit integer
 * arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); it
 *        does not have to be 32-byte aligned
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte indices that reverse each group of eight bytes. The byte shuffle
    // works per 128-bit lane and uses only the low four bits of each index.
    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        // Load four 64-bit points; the pointer advances after the in-place store.
        const __m256i input = _mm256_loadu_si256((const __m256i*)dataPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_storeu_si256((__m256i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap any remaining points. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
428 | |||
429 | #endif /* LV_HAVE_AVX2 */ | ||
430 | |||
431 | |||
432 | #if LV_HAVE_SSSE3 | ||
433 | #include <tmmintrin.h> | ||
/*!
 * \brief Byteswaps a vector of 64-bit values in place using SSSE3 with
 *        unaligned loads and stores.
 *
 * Two points are handled per 128-bit register with a single byte shuffle. A
 * leftover point (odd num_points) is swapped with plain 64-bit integer
 * arithmetic.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place); it
 *        does not have to be 16-byte aligned
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    const unsigned int nPerSet = 2;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte indices that reverse each group of eight bytes.
    const uint8_t shuffleVector[16] = { 7,  6,  5,  4,  3,  2,  1, 0,
                                        15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        // Load two 64-bit points; the pointer advances after the in-place store.
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap any remaining point. The data is accessed as uint64_t (its
    // declared type) instead of through a uint32_t alias.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        const uint64_t value = *dataPtr;
        *dataPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                     ((value >> 40) & 0x000000000000ff00ULL) |
                     ((value >> 24) & 0x0000000000ff0000ULL) |
                     ((value >> 8) & 0x00000000ff000000ULL) |
                     ((value << 8) & 0x000000ff00000000ULL) |
                     ((value << 24) & 0x0000ff0000000000ULL) |
                     ((value << 40) & 0x00ff000000000000ULL) |
                     ((value << 56) & 0xff00000000000000ULL);
    }
}
475 | #endif /* LV_HAVE_SSSE3 */ | ||
476 | |||
477 | #ifdef LV_HAVE_GENERIC | ||
478 | |||
/*!
 * \brief Byteswaps a vector of 64-bit values in place (portable reference
 *        implementation, aligned-kernel entry point).
 *
 * Every point is read and written as a uint64_t, so the buffer is never
 * accessed through a pointer of a different integer type. Reversing all eight
 * bytes at once gives the same result as byteswapping the two 32-bit halves
 * and exchanging them.
 *
 * \param intsToSwap The vector of data to byte swap (modified in place)
 * \param num_points The number of 64-bit data points
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;

    for (point = 0; point < num_points; point++) {
        const uint64_t value = intsToSwap[point];
        intsToSwap[point] = ((value >> 56) & 0x00000000000000ffULL) |
                            ((value >> 40) & 0x000000000000ff00ULL) |
                            ((value >> 24) & 0x0000000000ff0000ULL) |
                            ((value >> 8) & 0x00000000ff000000ULL) |
                            ((value << 8) & 0x000000ff00000000ULL) |
                            ((value << 24) & 0x0000ff0000000000ULL) |
                            ((value << 40) & 0x00ff000000000000ULL) |
                            ((value << 56) & 0xff00000000000000ULL);
    }
}
498 | #endif /* LV_HAVE_GENERIC */ | ||
499 | |||
500 | |||
501 | #endif /* INCLUDED_volk_64u_byteswap_a_H */ | ||
502 |