Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2018 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32u_reverse_32u | ||
12 | * | ||
13 | * \b bit reversal of the input 32 bit word | ||
14 | |||
15 | * <b>Dispatcher Prototype</b> | ||
16 | * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector, unsigned int | ||
17 | num_points); | ||
18 | * \endcode | ||
19 | * | ||
20 | * \b Inputs | ||
21 | * \li inputVector: The input vector | ||
22 | * \li num_points: The number of data points. | ||
23 | * | ||
24 | * \b Outputs | ||
25 | * \li outputVector: The vector where the results will be stored, which is the | ||
26 | bit-reversed input | ||
27 | * | ||
28 | * | ||
29 | */ | ||
30 | #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H | ||
/*
 * Bit-level view of one 32-bit word: 32 one-bit fields, b00 .. b31, so that a
 * kernel can address individual bits by name.
 *
 * The fields are declared `unsigned int` on purpose. Whether a plain `int`
 * bit-field is signed is implementation-defined, and a signed one-bit field can
 * only represent 0 and -1, so storing 1 would read back as -1.
 *
 * Which physical bit of the word each bNN names is implementation-defined
 * (allocation order of bit-fields); users of this struct must not rely on it
 * beyond bNN and b(31-NN) being mirror images of each other.
 */
struct dword_split {
    unsigned int b00 : 1;
    unsigned int b01 : 1;
    unsigned int b02 : 1;
    unsigned int b03 : 1;
    unsigned int b04 : 1;
    unsigned int b05 : 1;
    unsigned int b06 : 1;
    unsigned int b07 : 1;
    unsigned int b08 : 1;
    unsigned int b09 : 1;
    unsigned int b10 : 1;
    unsigned int b11 : 1;
    unsigned int b12 : 1;
    unsigned int b13 : 1;
    unsigned int b14 : 1;
    unsigned int b15 : 1;
    unsigned int b16 : 1;
    unsigned int b17 : 1;
    unsigned int b18 : 1;
    unsigned int b19 : 1;
    unsigned int b20 : 1;
    unsigned int b21 : 1;
    unsigned int b22 : 1;
    unsigned int b23 : 1;
    unsigned int b24 : 1;
    unsigned int b25 : 1;
    unsigned int b26 : 1;
    unsigned int b27 : 1;
    unsigned int b28 : 1;
    unsigned int b29 : 1;
    unsigned int b30 : 1;
    unsigned int b31 : 1;
};
/*
 * Bit-level view of a single byte: eight one-bit fields, b00 .. b07.
 * The byte_shuffle kernel indexes an array of these over a 32-bit word, so the
 * struct has to stay exactly one byte wide.
 */
struct char_split {
    uint8_t b00 : 1, b01 : 1, b02 : 1, b03 : 1;
    uint8_t b04 : 1, b05 : 1, b06 : 1, b07 : 1;
};
75 | |||
// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
//
// BitReverseTable256[v] is the byte v with its eight bits in reverse order.
// The 256 entries are generated two index bits at a time: each macro level
// consumes the next-lower pair of index bits and adds the mirrored pair to the
// value, so an index pair of 01 contributes binary 10 and vice versa.
#define BIT_REVERSE_R2(n) (n), ((n) + 2 * 64), ((n) + 1 * 64), ((n) + 3 * 64)
#define BIT_REVERSE_R4(n)                                               \
    BIT_REVERSE_R2(n), BIT_REVERSE_R2((n) + 2 * 16),                    \
        BIT_REVERSE_R2((n) + 1 * 16), BIT_REVERSE_R2((n) + 3 * 16)
#define BIT_REVERSE_R6(n)                                               \
    BIT_REVERSE_R4(n), BIT_REVERSE_R4((n) + 2 * 4),                     \
        BIT_REVERSE_R4((n) + 1 * 4), BIT_REVERSE_R4((n) + 3 * 4)
static const unsigned char BitReverseTable256[] = {
    BIT_REVERSE_R6(0), BIT_REVERSE_R6(2), BIT_REVERSE_R6(1), BIT_REVERSE_R6(3)
};
#undef BIT_REVERSE_R6
#undef BIT_REVERSE_R4
#undef BIT_REVERSE_R2
99 | #ifdef LV_HAVE_GENERIC | ||
100 | 2 | static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, | |
101 | const uint32_t* in, | ||
102 | unsigned int num_points) | ||
103 | { | ||
104 | 2 | const struct dword_split* in_ptr = (const struct dword_split*)in; | |
105 | 2 | struct dword_split* out_ptr = (struct dword_split*)out; | |
106 | 2 | unsigned int number = 0; | |
107 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; ++number) { |
108 | 262142 | out_ptr->b00 = in_ptr->b31; | |
109 | 262142 | out_ptr->b01 = in_ptr->b30; | |
110 | 262142 | out_ptr->b02 = in_ptr->b29; | |
111 | 262142 | out_ptr->b03 = in_ptr->b28; | |
112 | 262142 | out_ptr->b04 = in_ptr->b27; | |
113 | 262142 | out_ptr->b05 = in_ptr->b26; | |
114 | 262142 | out_ptr->b06 = in_ptr->b25; | |
115 | 262142 | out_ptr->b07 = in_ptr->b24; | |
116 | 262142 | out_ptr->b08 = in_ptr->b23; | |
117 | 262142 | out_ptr->b09 = in_ptr->b22; | |
118 | 262142 | out_ptr->b10 = in_ptr->b21; | |
119 | 262142 | out_ptr->b11 = in_ptr->b20; | |
120 | 262142 | out_ptr->b12 = in_ptr->b19; | |
121 | 262142 | out_ptr->b13 = in_ptr->b18; | |
122 | 262142 | out_ptr->b14 = in_ptr->b17; | |
123 | 262142 | out_ptr->b15 = in_ptr->b16; | |
124 | 262142 | out_ptr->b16 = in_ptr->b15; | |
125 | 262142 | out_ptr->b17 = in_ptr->b14; | |
126 | 262142 | out_ptr->b18 = in_ptr->b13; | |
127 | 262142 | out_ptr->b19 = in_ptr->b12; | |
128 | 262142 | out_ptr->b20 = in_ptr->b11; | |
129 | 262142 | out_ptr->b21 = in_ptr->b10; | |
130 | 262142 | out_ptr->b22 = in_ptr->b09; | |
131 | 262142 | out_ptr->b23 = in_ptr->b08; | |
132 | 262142 | out_ptr->b24 = in_ptr->b07; | |
133 | 262142 | out_ptr->b25 = in_ptr->b06; | |
134 | 262142 | out_ptr->b26 = in_ptr->b05; | |
135 | 262142 | out_ptr->b27 = in_ptr->b04; | |
136 | 262142 | out_ptr->b28 = in_ptr->b03; | |
137 | 262142 | out_ptr->b29 = in_ptr->b02; | |
138 | 262142 | out_ptr->b30 = in_ptr->b01; | |
139 | 262142 | out_ptr->b31 = in_ptr->b00; | |
140 | 262142 | ++in_ptr; | |
141 | 262142 | ++out_ptr; | |
142 | } | ||
143 | 2 | } | |
144 | #endif /* LV_HAVE_GENERIC */ | ||
145 | |||
146 | #ifdef LV_HAVE_GENERIC | ||
147 | 2 | static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, | |
148 | const uint32_t* in, | ||
149 | unsigned int num_points) | ||
150 | { | ||
151 | 2 | const uint32_t* in_ptr = in; | |
152 | 2 | uint32_t* out_ptr = out; | |
153 | 2 | unsigned int number = 0; | |
154 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; ++number) { |
155 | 262142 | const struct char_split* in8 = (const struct char_split*)in_ptr; | |
156 | 262142 | struct char_split* out8 = (struct char_split*)out_ptr; | |
157 | |||
158 | 262142 | out8[3].b00 = in8[0].b07; | |
159 | 262142 | out8[3].b01 = in8[0].b06; | |
160 | 262142 | out8[3].b02 = in8[0].b05; | |
161 | 262142 | out8[3].b03 = in8[0].b04; | |
162 | 262142 | out8[3].b04 = in8[0].b03; | |
163 | 262142 | out8[3].b05 = in8[0].b02; | |
164 | 262142 | out8[3].b06 = in8[0].b01; | |
165 | 262142 | out8[3].b07 = in8[0].b00; | |
166 | |||
167 | 262142 | out8[2].b00 = in8[1].b07; | |
168 | 262142 | out8[2].b01 = in8[1].b06; | |
169 | 262142 | out8[2].b02 = in8[1].b05; | |
170 | 262142 | out8[2].b03 = in8[1].b04; | |
171 | 262142 | out8[2].b04 = in8[1].b03; | |
172 | 262142 | out8[2].b05 = in8[1].b02; | |
173 | 262142 | out8[2].b06 = in8[1].b01; | |
174 | 262142 | out8[2].b07 = in8[1].b00; | |
175 | |||
176 | 262142 | out8[1].b00 = in8[2].b07; | |
177 | 262142 | out8[1].b01 = in8[2].b06; | |
178 | 262142 | out8[1].b02 = in8[2].b05; | |
179 | 262142 | out8[1].b03 = in8[2].b04; | |
180 | 262142 | out8[1].b04 = in8[2].b03; | |
181 | 262142 | out8[1].b05 = in8[2].b02; | |
182 | 262142 | out8[1].b06 = in8[2].b01; | |
183 | 262142 | out8[1].b07 = in8[2].b00; | |
184 | |||
185 | 262142 | out8[0].b00 = in8[3].b07; | |
186 | 262142 | out8[0].b01 = in8[3].b06; | |
187 | 262142 | out8[0].b02 = in8[3].b05; | |
188 | 262142 | out8[0].b03 = in8[3].b04; | |
189 | 262142 | out8[0].b04 = in8[3].b03; | |
190 | 262142 | out8[0].b05 = in8[3].b02; | |
191 | 262142 | out8[0].b06 = in8[3].b01; | |
192 | 262142 | out8[0].b07 = in8[3].b00; | |
193 | 262142 | ++in_ptr; | |
194 | 262142 | ++out_ptr; | |
195 | } | ||
196 | 2 | } | |
197 | #endif /* LV_HAVE_GENERIC */ | ||
198 | |||
199 | // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain | ||
200 | // http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||
201 | #ifdef LV_HAVE_GENERIC | ||
202 | static inline void | ||
203 | 2 | volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points) | |
204 | { | ||
205 | 2 | const uint32_t* in_ptr = in; | |
206 | 2 | uint32_t* out_ptr = out; | |
207 | 2 | unsigned int number = 0; | |
208 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; ++number) { |
209 | 262142 | *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) | | |
210 | 262142 | (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | | |
211 | 262142 | (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | | |
212 | 262142 | (BitReverseTable256[(*in_ptr >> 24) & 0xff]); | |
213 | 262142 | ++in_ptr; | |
214 | 262142 | ++out_ptr; | |
215 | } | ||
216 | 2 | } | |
217 | #endif /* LV_HAVE_GENERIC */ | ||
218 | |||
219 | // Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public | ||
220 | // domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits | ||
221 | #ifdef LV_HAVE_GENERIC | ||
/*
 * Reverse the bit order of each 32-bit word, one byte at a time, using the
 * 64-bit multiply / mask / multiply byte reversal from "Bit Twiddling Hacks"
 * (ReverseByteWith64Bits). The byte at offset k of the input word is mirrored
 * and stored at offset 3-k of the output word.
 *
 * All four input bytes are read before the first output byte is written;
 * otherwise a call with out == in would overwrite bytes 3 and 2 before they
 * are consumed and produce a wrong result.
 *
 * out:        destination buffer, num_points words (may be the same as in)
 * in:         source buffer, num_points words
 * num_points: number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;

    for (; number < num_points; ++number) {
        const uint8_t* in8 = (const uint8_t*)in_ptr;
        uint8_t* out8 = (uint8_t*)out_ptr;
        // snapshot the input word's bytes so in-place operation is safe
        const uint8_t b0 = in8[0];
        const uint8_t b1 = in8[1];
        const uint8_t b2 = in8[2];
        const uint8_t b3 = in8[3];

        // the reversed byte is the low 8 bits of the shifted product
        out8[3] = (uint8_t)((((b0 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL) >> 32);
        out8[2] = (uint8_t)((((b1 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL) >> 32);
        out8[1] = (uint8_t)((((b2 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL) >> 32);
        out8[0] = (uint8_t)((((b3 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL) >> 32);
        ++in_ptr;
        ++out_ptr;
    }
}
241 | #endif /* LV_HAVE_GENERIC */ | ||
242 | |||
243 | #ifdef LV_HAVE_GENERIC | ||
// Current gr-pager implementation
/*
 * Reverse the bit order of each 32-bit word, one byte at a time, using a
 * 64-bit multiply, a mask and a modulo by 1023 to mirror each byte. The byte
 * at offset k of the input word is mirrored and stored at offset 3-k of the
 * output word.
 *
 * All four input bytes are read before the first output byte is written;
 * otherwise a call with out == in would overwrite bytes 3 and 2 before they
 * are consumed and produce a wrong result.
 *
 * out:        destination buffer, num_points words (may be the same as in)
 * in:         source buffer, num_points words
 * num_points: number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;

    for (; number < num_points; ++number) {
        const uint8_t* in8 = (const uint8_t*)in_ptr;
        uint8_t* out8 = (uint8_t*)out_ptr;
        // snapshot the input word's bytes so in-place operation is safe
        const uint8_t b0 = in8[0];
        const uint8_t b1 = in8[1];
        const uint8_t b2 = in8[2];
        const uint8_t b3 = in8[3];

        out8[3] = (uint8_t)(((b0 * 0x0202020202ULL) & 0x010884422010ULL) % 1023);
        out8[2] = (uint8_t)(((b1 * 0x0202020202ULL) & 0x010884422010ULL) % 1023);
        out8[1] = (uint8_t)(((b2 * 0x0202020202ULL) & 0x010884422010ULL) % 1023);
        out8[0] = (uint8_t)(((b3 * 0x0202020202ULL) & 0x010884422010ULL) % 1023);
        ++in_ptr;
        ++out_ptr;
    }
}
264 | #endif /* LV_HAVE_GENERIC */ | ||
265 | |||
266 | // After lengthy thought and quite a bit of whiteboarding: | ||
267 | #ifdef LV_HAVE_GENERIC | ||
/*
 * Reverse the bit order of each 32-bit word by successive halving: swap the two
 * 16-bit halves, then the bytes inside each half, then the nibbles inside each
 * byte, then the bit pairs inside each nibble, and finally neighbouring bits.
 *
 * Every step below the first has the same shape: one operand keeps the low
 * element of every pair and moves it up, the other moves the high element down
 * and keeps it; the mask selects the low element of each pair.
 *
 * out:        destination buffer, num_points words
 * in:         source buffer, num_points words
 * num_points: number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
                                                                 const uint32_t* in,
                                                                 unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        uint32_t word = in[idx];

        // 16-bit halves: no mask needed, the shifts discard the other half
        word = (word >> 16) | (word << 16);
        // bytes within each half
        word = ((word >> 8) & 0x00FF00FFu) | ((word & 0x00FF00FFu) << 8);
        // nibbles within each byte
        word = ((word >> 4) & 0x0F0F0F0Fu) | ((word & 0x0F0F0F0Fu) << 4);
        // bit pairs within each nibble (0b0011 repeated)
        word = ((word >> 2) & 0x33333333u) | ((word & 0x33333333u) << 2);
        // odd/even bits (0b01 repeated)
        word = ((word >> 1) & 0x55555555u) | ((word & 0x55555555u) << 1);

        out[idx] = word;
    }
}
307 | #endif /* LV_HAVE_GENERIC */ | ||
308 | #ifdef LV_HAVE_GENERIC | ||
/*
 * Reverse the bit order of each 32-bit word by successive doubling: swap
 * neighbouring bits, then bit pairs, nibbles, bytes and finally the two 16-bit
 * halves. These are the same swaps as the top-down variant applied in the
 * opposite order.
 *
 * out:        destination buffer, num_points words
 * in:         source buffer, num_points words
 * num_points: number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
                                                                  const uint32_t* in,
                                                                  unsigned int num_points)
{
    // low_element[k] selects the lower element of every pair of (1 << k)-bit groups
    const uint32_t low_element[5] = {
        0x55555555u, 0x33333333u, 0x0F0F0F0Fu, 0x00FF00FFu, 0x0000FFFFu
    };
    unsigned int idx;
    unsigned int level;

    for (idx = 0; idx < num_points; ++idx) {
        uint32_t word = in[idx];

        for (level = 0; level < 5; ++level) {
            const unsigned int width = 1u << level;
            word = ((word & low_element[level]) << width) |
                   ((word >> width) & low_element[level]);
        }

        out[idx] = word;
    }
}
331 | #endif /* LV_HAVE_GENERIC */ | ||
332 | |||
333 | #ifdef LV_HAVE_NEONV8 | ||
334 | #include <arm_neon.h> | ||
335 | |||
336 | static inline void | ||
337 | volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points) | ||
338 | { | ||
339 | const uint32_t* in_ptr = in; | ||
340 | uint32_t* out_ptr = out; | ||
341 | |||
342 | const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; | ||
343 | |||
344 | const unsigned int quarterPoints = num_points / 4; | ||
345 | unsigned int number = 0; | ||
346 | for (; number < quarterPoints; ++number) { | ||
347 | __VOLK_PREFETCH(in_ptr + 4); | ||
348 | uint32x4_t x = vld1q_u32(in_ptr); | ||
349 | uint32x4_t z = | ||
350 | vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx)); | ||
351 | vst1q_u32(out_ptr, z); | ||
352 | in_ptr += 4; | ||
353 | out_ptr += 4; | ||
354 | } | ||
355 | number = quarterPoints * 4; | ||
356 | for (; number < num_points; ++number) { | ||
357 | *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) | | ||
358 | (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | | ||
359 | (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | | ||
360 | (BitReverseTable256[(*in_ptr >> 24) & 0xff]); | ||
361 | ++in_ptr; | ||
362 | ++out_ptr; | ||
363 | } | ||
364 | } | ||
365 | |||
366 | #endif /* LV_HAVE_NEONV8 */ | ||
367 | |||
368 | #ifdef LV_HAVE_NEON | ||
369 | #include <arm_neon.h> | ||
370 | |||
371 | #if defined(__aarch64__) | ||
372 | #define DO_RBIT \ | ||
373 | __VOLK_ASM("rbit %w[result], %w[value]" \ | ||
374 | : [result] "=r"(*out_ptr) \ | ||
375 | : [value] "r"(*in_ptr) \ | ||
376 | :); \ | ||
377 | in_ptr++; \ | ||
378 | out_ptr++; | ||
379 | #else | ||
380 | #define DO_RBIT \ | ||
381 | __VOLK_ASM("rbit %[result], %[value]" \ | ||
382 | : [result] "=r"(*out_ptr) \ | ||
383 | : [value] "r"(*in_ptr) \ | ||
384 | :); \ | ||
385 | in_ptr++; \ | ||
386 | out_ptr++; | ||
387 | #endif | ||
388 | |||
389 | static inline void | ||
390 | volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points) | ||
391 | { | ||
392 | |||
393 | const uint32_t* in_ptr = in; | ||
394 | uint32_t* out_ptr = out; | ||
395 | const unsigned int eighthPoints = num_points / 8; | ||
396 | unsigned int number = 0; | ||
397 | for (; number < eighthPoints; ++number) { | ||
398 | __VOLK_PREFETCH(in_ptr + 8); | ||
399 | DO_RBIT; | ||
400 | DO_RBIT; | ||
401 | DO_RBIT; | ||
402 | DO_RBIT; | ||
403 | DO_RBIT; | ||
404 | DO_RBIT; | ||
405 | DO_RBIT; | ||
406 | DO_RBIT; | ||
407 | } | ||
408 | number = eighthPoints * 8; | ||
409 | for (; number < num_points; ++number) { | ||
410 | DO_RBIT; | ||
411 | } | ||
412 | } | ||
413 | #undef DO_RBIT | ||
414 | #endif /* LV_HAVE_NEON */ | ||
415 | |||
416 | |||
417 | #endif /* INCLUDED_VOLK_32u_REVERSE_32u_U_H */ | ||
418 |