Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8ic_deinterleave_real_8i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the complex 8-bit char vector into just the I (real) | ||
16 | * vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, | ||
21 | * unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li complexVector: The complex input vector. | ||
25 | * \li num_points: The number of complex data values to be deinterleaved. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li iBuffer: The I buffer output data. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | * int N = 10000; | ||
33 | * | ||
34 | * volk_8ic_deinterleave_real_8i(); | ||
35 | * | ||
36 | * volk_free(x); | ||
37 | * \endcode | ||
38 | */ | ||
39 | |||
40 | #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H | ||
41 | #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H | ||
42 | |||
43 | #include <inttypes.h> | ||
44 | #include <stdio.h> | ||
45 | |||
46 | #ifdef LV_HAVE_AVX2 | ||
47 | #include <immintrin.h> | ||
48 | |||
49 | 2 | static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, | |
50 | const lv_8sc_t* complexVector, | ||
51 | unsigned int num_points) | ||
52 | { | ||
53 | 2 | unsigned int number = 0; | |
54 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
55 | 2 | int8_t* iBufferPtr = iBuffer; | |
56 | 2 | __m256i moveMask1 = _mm256_set_epi8(0x80, | |
57 | 0x80, | ||
58 | 0x80, | ||
59 | 0x80, | ||
60 | 0x80, | ||
61 | 0x80, | ||
62 | 0x80, | ||
63 | 0x80, | ||
64 | 14, | ||
65 | 12, | ||
66 | 10, | ||
67 | 8, | ||
68 | 6, | ||
69 | 4, | ||
70 | 2, | ||
71 | 0, | ||
72 | 0x80, | ||
73 | 0x80, | ||
74 | 0x80, | ||
75 | 0x80, | ||
76 | 0x80, | ||
77 | 0x80, | ||
78 | 0x80, | ||
79 | 0x80, | ||
80 | 14, | ||
81 | 12, | ||
82 | 10, | ||
83 | 8, | ||
84 | 6, | ||
85 | 4, | ||
86 | 2, | ||
87 | 0); | ||
88 | 2 | __m256i moveMask2 = _mm256_set_epi8(14, | |
89 | 12, | ||
90 | 10, | ||
91 | 8, | ||
92 | 6, | ||
93 | 4, | ||
94 | 2, | ||
95 | 0, | ||
96 | 0x80, | ||
97 | 0x80, | ||
98 | 0x80, | ||
99 | 0x80, | ||
100 | 0x80, | ||
101 | 0x80, | ||
102 | 0x80, | ||
103 | 0x80, | ||
104 | 14, | ||
105 | 12, | ||
106 | 10, | ||
107 | 8, | ||
108 | 6, | ||
109 | 4, | ||
110 | 2, | ||
111 | 0, | ||
112 | 0x80, | ||
113 | 0x80, | ||
114 | 0x80, | ||
115 | 0x80, | ||
116 | 0x80, | ||
117 | 0x80, | ||
118 | 0x80, | ||
119 | 0x80); | ||
120 | __m256i complexVal1, complexVal2, outputVal; | ||
121 | |||
122 | 2 | unsigned int thirtysecondPoints = num_points / 32; | |
123 | |||
124 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (number = 0; number < thirtysecondPoints; number++) { |
125 | |||
126 | 8190 | complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
127 | 8190 | complexVectorPtr += 32; | |
128 | 8190 | complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
129 | 8190 | complexVectorPtr += 32; | |
130 | |||
131 | 8190 | complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); | |
132 | 8190 | complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); | |
133 | 8190 | outputVal = _mm256_or_si256(complexVal1, complexVal2); | |
134 | 8190 | outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); | |
135 | |||
136 | _mm256_store_si256((__m256i*)iBufferPtr, outputVal); | ||
137 | 8190 | iBufferPtr += 32; | |
138 | } | ||
139 | |||
140 | 2 | number = thirtysecondPoints * 32; | |
141 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
142 | 62 | *iBufferPtr++ = *complexVectorPtr++; | |
143 | 62 | complexVectorPtr++; | |
144 | } | ||
145 | 2 | } | |
146 | #endif /* LV_HAVE_AVX2 */ | ||
147 | |||
148 | |||
149 | #ifdef LV_HAVE_SSSE3 | ||
150 | #include <tmmintrin.h> | ||
151 | |||
152 | 2 | static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, | |
153 | const lv_8sc_t* complexVector, | ||
154 | unsigned int num_points) | ||
155 | { | ||
156 | 2 | unsigned int number = 0; | |
157 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
158 | 2 | int8_t* iBufferPtr = iBuffer; | |
159 | 2 | __m128i moveMask1 = _mm_set_epi8( | |
160 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); | ||
161 | 2 | __m128i moveMask2 = _mm_set_epi8( | |
162 | 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); | ||
163 | __m128i complexVal1, complexVal2, outputVal; | ||
164 | |||
165 | 2 | unsigned int sixteenthPoints = num_points / 16; | |
166 | |||
167 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
168 | 16382 | complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); | |
169 | 16382 | complexVectorPtr += 16; | |
170 | 16382 | complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); | |
171 | 16382 | complexVectorPtr += 16; | |
172 | |||
173 | 16382 | complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); | |
174 | 16382 | complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); | |
175 | |||
176 | 16382 | outputVal = _mm_or_si128(complexVal1, complexVal2); | |
177 | |||
178 | _mm_store_si128((__m128i*)iBufferPtr, outputVal); | ||
179 | 16382 | iBufferPtr += 16; | |
180 | } | ||
181 | |||
182 | 2 | number = sixteenthPoints * 16; | |
183 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
184 | 30 | *iBufferPtr++ = *complexVectorPtr++; | |
185 | 30 | complexVectorPtr++; | |
186 | } | ||
187 | 2 | } | |
188 | #endif /* LV_HAVE_SSSE3 */ | ||
189 | |||
190 | |||
191 | #ifdef LV_HAVE_AVX | ||
192 | #include <immintrin.h> | ||
193 | |||
194 | 2 | static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, | |
195 | const lv_8sc_t* complexVector, | ||
196 | unsigned int num_points) | ||
197 | { | ||
198 | 2 | unsigned int number = 0; | |
199 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
200 | 2 | int8_t* iBufferPtr = iBuffer; | |
201 | 2 | __m128i moveMaskL = _mm_set_epi8( | |
202 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); | ||
203 | 2 | __m128i moveMaskH = _mm_set_epi8( | |
204 | 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); | ||
205 | __m256i complexVal1, complexVal2, outputVal; | ||
206 | __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, | ||
207 | outputVal2; | ||
208 | |||
209 | 2 | unsigned int thirtysecondPoints = num_points / 32; | |
210 | |||
211 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (number = 0; number < thirtysecondPoints; number++) { |
212 | |||
213 | 8190 | complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
214 | 8190 | complexVectorPtr += 32; | |
215 | 8190 | complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
216 | 8190 | complexVectorPtr += 32; | |
217 | |||
218 | 8190 | complexVal1H = _mm256_extractf128_si256(complexVal1, 1); | |
219 | 8190 | complexVal1L = _mm256_extractf128_si256(complexVal1, 0); | |
220 | 8190 | complexVal2H = _mm256_extractf128_si256(complexVal2, 1); | |
221 | 8190 | complexVal2L = _mm256_extractf128_si256(complexVal2, 0); | |
222 | |||
223 | 8190 | complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); | |
224 | 8190 | complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); | |
225 | 8190 | outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); | |
226 | |||
227 | |||
228 | 8190 | complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); | |
229 | 8190 | complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); | |
230 | 8190 | outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); | |
231 | |||
232 | 8190 | __m256i dummy = _mm256_setzero_si256(); | |
233 | 8190 | outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); | |
234 | 8190 | outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); | |
235 | |||
236 | |||
237 | _mm256_store_si256((__m256i*)iBufferPtr, outputVal); | ||
238 | 8190 | iBufferPtr += 32; | |
239 | } | ||
240 | |||
241 | 2 | number = thirtysecondPoints * 32; | |
242 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
243 | 62 | *iBufferPtr++ = *complexVectorPtr++; | |
244 | 62 | complexVectorPtr++; | |
245 | } | ||
246 | 2 | } | |
247 | #endif /* LV_HAVE_AVX */ | ||
248 | |||
249 | |||
250 | #ifdef LV_HAVE_GENERIC | ||
251 | |||
252 | 2 | static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, | |
253 | const lv_8sc_t* complexVector, | ||
254 | unsigned int num_points) | ||
255 | { | ||
256 | 2 | unsigned int number = 0; | |
257 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
258 | 2 | int8_t* iBufferPtr = iBuffer; | |
259 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
260 | 262142 | *iBufferPtr++ = *complexVectorPtr++; | |
261 | 262142 | complexVectorPtr++; | |
262 | } | ||
263 | 2 | } | |
264 | #endif /* LV_HAVE_GENERIC */ | ||
265 | |||
266 | |||
267 | #ifdef LV_HAVE_NEON | ||
268 | #include <arm_neon.h> | ||
269 | |||
270 | static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, | ||
271 | const lv_8sc_t* complexVector, | ||
272 | unsigned int num_points) | ||
273 | { | ||
274 | unsigned int number; | ||
275 | unsigned int sixteenth_points = num_points / 16; | ||
276 | |||
277 | int8x16x2_t input_vector; | ||
278 | for (number = 0; number < sixteenth_points; ++number) { | ||
279 | input_vector = vld2q_s8((int8_t*)complexVector); | ||
280 | vst1q_s8(iBuffer, input_vector.val[0]); | ||
281 | iBuffer += 16; | ||
282 | complexVector += 16; | ||
283 | } | ||
284 | |||
285 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | ||
286 | int8_t* iBufferPtr = iBuffer; | ||
287 | for (number = sixteenth_points * 16; number < num_points; number++) { | ||
288 | *iBufferPtr++ = *complexVectorPtr++; | ||
289 | complexVectorPtr++; | ||
290 | } | ||
291 | } | ||
292 | #endif /* LV_HAVE_NEON */ | ||
293 | |||
294 | |||
295 | #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ | ||
296 | |||
297 | #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H | ||
298 | #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H | ||
299 | |||
300 | #include <inttypes.h> | ||
301 | #include <stdio.h> | ||
302 | |||
303 | #ifdef LV_HAVE_AVX2 | ||
304 | #include <immintrin.h> | ||
305 | |||
306 | 2 | static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, | |
307 | const lv_8sc_t* complexVector, | ||
308 | unsigned int num_points) | ||
309 | { | ||
310 | 2 | unsigned int number = 0; | |
311 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
312 | 2 | int8_t* iBufferPtr = iBuffer; | |
313 | 2 | __m256i moveMask1 = _mm256_set_epi8(0x80, | |
314 | 0x80, | ||
315 | 0x80, | ||
316 | 0x80, | ||
317 | 0x80, | ||
318 | 0x80, | ||
319 | 0x80, | ||
320 | 0x80, | ||
321 | 14, | ||
322 | 12, | ||
323 | 10, | ||
324 | 8, | ||
325 | 6, | ||
326 | 4, | ||
327 | 2, | ||
328 | 0, | ||
329 | 0x80, | ||
330 | 0x80, | ||
331 | 0x80, | ||
332 | 0x80, | ||
333 | 0x80, | ||
334 | 0x80, | ||
335 | 0x80, | ||
336 | 0x80, | ||
337 | 14, | ||
338 | 12, | ||
339 | 10, | ||
340 | 8, | ||
341 | 6, | ||
342 | 4, | ||
343 | 2, | ||
344 | 0); | ||
345 | 2 | __m256i moveMask2 = _mm256_set_epi8(14, | |
346 | 12, | ||
347 | 10, | ||
348 | 8, | ||
349 | 6, | ||
350 | 4, | ||
351 | 2, | ||
352 | 0, | ||
353 | 0x80, | ||
354 | 0x80, | ||
355 | 0x80, | ||
356 | 0x80, | ||
357 | 0x80, | ||
358 | 0x80, | ||
359 | 0x80, | ||
360 | 0x80, | ||
361 | 14, | ||
362 | 12, | ||
363 | 10, | ||
364 | 8, | ||
365 | 6, | ||
366 | 4, | ||
367 | 2, | ||
368 | 0, | ||
369 | 0x80, | ||
370 | 0x80, | ||
371 | 0x80, | ||
372 | 0x80, | ||
373 | 0x80, | ||
374 | 0x80, | ||
375 | 0x80, | ||
376 | 0x80); | ||
377 | __m256i complexVal1, complexVal2, outputVal; | ||
378 | |||
379 | 2 | unsigned int thirtysecondPoints = num_points / 32; | |
380 | |||
381 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (number = 0; number < thirtysecondPoints; number++) { |
382 | |||
383 | 8190 | complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
384 | 8190 | complexVectorPtr += 32; | |
385 | 8190 | complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
386 | 8190 | complexVectorPtr += 32; | |
387 | |||
388 | 8190 | complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); | |
389 | 8190 | complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); | |
390 | 8190 | outputVal = _mm256_or_si256(complexVal1, complexVal2); | |
391 | 8190 | outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); | |
392 | |||
393 | _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); | ||
394 | 8190 | iBufferPtr += 32; | |
395 | } | ||
396 | |||
397 | 2 | number = thirtysecondPoints * 32; | |
398 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
399 | 62 | *iBufferPtr++ = *complexVectorPtr++; | |
400 | 62 | complexVectorPtr++; | |
401 | } | ||
402 | 2 | } | |
403 | #endif /* LV_HAVE_AVX2 */ | ||
404 | |||
405 | #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */ | ||
406 |