GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16u_byteswap.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 68 79 86.1%
Functions: 5 7 71.4%
Branches: 16 18 88.9%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16u_byteswap
12 *
13 * \b Overview
14 *
15 * Byteswaps (in-place) an aligned vector of uint16_t's.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_16u_byteswap(uint16_t* intsToSwap, unsigned int num_points)
20 * \endcode
21 *
22 * \b Inputs
23 * \li intsToSwap: The vector of data to byte swap.
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li intsToSwap: returns as an in-place calculation.
28 *
29 * \b Example
30 * \code
31 * int N = 10000;
32 *
33 * <FIXME>
34 *
35 * volk_16u_byteswap(x, N);
36 *
37 * \endcode
38 */
39
40 #ifndef INCLUDED_volk_16u_byteswap_u_H
41 #define INCLUDED_volk_16u_byteswap_u_H
42
43 #include <inttypes.h>
44 #include <stdio.h>
45
46 #ifdef LV_HAVE_GENERIC
47
48 4 static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
49 unsigned int num_points)
50 {
51 4 uint16_t* inputPtr = intsToSwap;
52
2/2
✓ Branch 0 taken 262156 times.
✓ Branch 1 taken 4 times.
262160 for (unsigned int point = 0; point < num_points; point++) {
53 262156 uint16_t output = *inputPtr;
54 262156 output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
55 262156 *inputPtr = output;
56 262156 inputPtr++;
57 }
58 4 }
59 #endif /* LV_HAVE_GENERIC */
60
61
62 #if LV_HAVE_AVX2
63 #include <immintrin.h>
64 2 static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
65 {
66 unsigned int number;
67
68 2 const unsigned int nPerSet = 16;
69 2 const uint64_t nSets = num_points / nPerSet;
70
71 2 uint16_t* inputPtr = (uint16_t*)intsToSwap;
72
73 2 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
74 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
75 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
76
77 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
78
79
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < nSets; number++) {
80 // Load 16 uint16_t values (256 bits); inputPtr is advanced after the store since the swap is done in-place.
81 16382 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
82 16382 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
83
84 // Store the results
85 _mm256_store_si256((__m256i*)inputPtr, output);
86 16382 inputPtr += nPerSet;
87 }
88
89 // Byteswap any remaining points:
90
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (number = nPerSet * nSets; number < num_points; number++) {
91 30 uint16_t outputVal = *inputPtr;
92 30 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
93 30 *inputPtr = outputVal;
94 30 inputPtr++;
95 }
96 2 }
97 #endif /* LV_HAVE_AVX2 */
98
99
100 #if LV_HAVE_AVX2
101 #include <immintrin.h>
102 2 static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
103 {
104 unsigned int number;
105
106 2 const unsigned int nPerSet = 16;
107 2 const uint64_t nSets = num_points / nPerSet;
108
109 2 uint16_t* inputPtr = (uint16_t*)intsToSwap;
110
111 2 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
112 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
113 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
114
115 2 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
116
117
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < nSets; number++) {
118 // Load 16 uint16_t values (256 bits); inputPtr is advanced after the store since the swap is done in-place.
119 16382 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
120 16382 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
121
122 // Store the results
123 _mm256_storeu_si256((__m256i*)inputPtr, output);
124 16382 inputPtr += nPerSet;
125 }
126
127 // Byteswap any remaining points:
128
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (number = nPerSet * nSets; number < num_points; number++) {
129 30 uint16_t outputVal = *inputPtr;
130 30 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
131 30 *inputPtr = outputVal;
132 30 inputPtr++;
133 }
134 2 }
135 #endif /* LV_HAVE_AVX2 */
136
137
138 #ifdef LV_HAVE_SSE2
139 #include <emmintrin.h>
140
141 2 static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
142 {
143 2 unsigned int number = 0;
144 2 uint16_t* inputPtr = intsToSwap;
145 __m128i input, left, right, output;
146
147 2 const unsigned int eighthPoints = num_points / 8;
148
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
149 // Load 8 uint16_t values (128 bits); inputPtr is advanced after the store since the swap is done in-place.
150 32766 input = _mm_loadu_si128((__m128i*)inputPtr);
151 // Do the two shifts
152 32766 left = _mm_slli_epi16(input, 8);
153 32766 right = _mm_srli_epi16(input, 8);
154 // Or the left and right halves together
155 32766 output = _mm_or_si128(left, right);
156 // Store the results
157 _mm_storeu_si128((__m128i*)inputPtr, output);
158 32766 inputPtr += 8;
159 }
160
161 // Byteswap any remaining points:
162 2 number = eighthPoints * 8;
163
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
164 14 uint16_t outputVal = *inputPtr;
165 14 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
166 14 *inputPtr = outputVal;
167 14 inputPtr++;
168 }
169 2 }
170 #endif /* LV_HAVE_SSE2 */
171
172
173 #endif /* INCLUDED_volk_16u_byteswap_u_H */
174 #ifndef INCLUDED_volk_16u_byteswap_a_H
175 #define INCLUDED_volk_16u_byteswap_a_H
176
177 #include <inttypes.h>
178 #include <stdio.h>
179
180 #ifdef LV_HAVE_SSE2
181 #include <emmintrin.h>
182
183 2 static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
184 {
185 2 uint16_t* inputPtr = intsToSwap;
186 __m128i input, left, right, output;
187
188 2 const unsigned int eighthPoints = num_points / 8;
189
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned int number = 0; number < eighthPoints; number++) {
190 // Load 8 uint16_t values (128 bits); inputPtr is advanced after the store since the swap is done in-place.
191 32766 input = _mm_load_si128((__m128i*)inputPtr);
192 // Do the two shifts
193 32766 left = _mm_slli_epi16(input, 8);
194 32766 right = _mm_srli_epi16(input, 8);
195 // Or the left and right halves together
196 32766 output = _mm_or_si128(left, right);
197 // Store the results
198 _mm_store_si128((__m128i*)inputPtr, output);
199 32766 inputPtr += 8;
200 }
201
202 // Byteswap any remaining points:
203 2 volk_16u_byteswap_generic(inputPtr, num_points - eighthPoints * 8);
204 2 }
205 #endif /* LV_HAVE_SSE2 */
206
207 #ifdef LV_HAVE_NEON
208 #include <arm_neon.h>
209
210 static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
211 {
212 unsigned int number;
213 unsigned int eighth_points = num_points / 8;
214 uint16x8_t input, output;
215 uint16_t* inputPtr = intsToSwap;
216
217 for (number = 0; number < eighth_points; number++) {
218 input = vld1q_u16(inputPtr);
219 output = vsriq_n_u16(output, input, 8);
220 output = vsliq_n_u16(output, input, 8);
221 vst1q_u16(inputPtr, output);
222 inputPtr += 8;
223 }
224
225 volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
226 }
227 #endif /* LV_HAVE_NEON */
228
229 #ifdef LV_HAVE_NEON
230 #include <arm_neon.h>
231
232 static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
233 unsigned int num_points)
234 {
235 uint16_t* inputPtr = intsToSwap;
236 unsigned int number = 0;
237 unsigned int n16points = num_points / 16;
238
239 uint8x8x4_t input_table;
240 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
241 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
242
243 /* these magic numbers are used as byte-indices in the LUT.
244 they are pre-computed to save time. A simple C program
245 can calculate them; for example for lookup01:
246 uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
247 for(ii=0; ii < 8; ++ii) {
248 index += ((uint64_t)(*(chars+ii))) << (ii*8);
249 }
250 */
251 int_lookup01 = vcreate_u8(1232017111498883080);
252 int_lookup23 = vcreate_u8(1376697457175036426);
253 int_lookup45 = vcreate_u8(1521377802851189772);
254 int_lookup67 = vcreate_u8(1666058148527343118);
255
256 for (number = 0; number < n16points; ++number) {
257 input_table = vld4_u8((uint8_t*)inputPtr);
258 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
259 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
260 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
261 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
262 vst1_u8((uint8_t*)inputPtr, swapped_int01);
263 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
264 vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
265 vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
266
267 inputPtr += 16;
268 }
269
270 volk_16u_byteswap_generic(inputPtr, num_points - n16points * 16);
271 }
272 #endif /* LV_HAVE_NEON */
273
274 #ifdef LV_HAVE_GENERIC
275
276 static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
277 unsigned int num_points)
278 {
279 uint16_t* inputPtr = intsToSwap;
280 for (unsigned int point = 0; point < num_points; point++) {
281 uint16_t output = *inputPtr;
282 output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
283 *inputPtr = output;
284 inputPtr++;
285 }
286 }
287 #endif /* LV_HAVE_GENERIC */
288
289 #ifdef LV_HAVE_ORC
290
291 extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
292 static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
293 {
294 volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
295 }
296 #endif /* LV_HAVE_ORC */
297
298
299 #endif /* INCLUDED_volk_16u_byteswap_a_H */
300