Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_deinterleave_16i_x2 | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the complex 16 bit vector into I & Q vector data. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* | ||
20 | * complexVector, unsigned int num_points) \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li complexVector: The complex input vector. | ||
24 | * \li num_points: The number of complex data values to be deinterleaved. | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li iBuffer: The I buffer output data. | ||
28 | * \li qBuffer: The Q buffer output data. | ||
29 | * | ||
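 * \b Example
 * \code
 *   unsigned int N = 10000;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
 *   int16_t* iOut = (int16_t*)volk_malloc(sizeof(int16_t) * N, alignment);
 *   int16_t* qOut = (int16_t*)volk_malloc(sizeof(int16_t) * N, alignment);
 *
 *   // ... fill `in` with N complex samples ...
 *
 *   volk_16ic_deinterleave_16i_x2(iOut, qOut, in, N);
 *
 *   volk_free(in);
 *   volk_free(iOut);
 *   volk_free(qOut);
 * \endcode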
39 | */ | ||
40 | |||
41 | #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H | ||
42 | #define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H | ||
43 | |||
44 | #include <inttypes.h> | ||
45 | #include <stdio.h> | ||
46 | #ifdef LV_HAVE_AVX2 | ||
47 | #include <immintrin.h> | ||
48 | |||
49 | 2 | static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, | |
50 | int16_t* qBuffer, | ||
51 | const lv_16sc_t* complexVector, | ||
52 | unsigned int num_points) | ||
53 | { | ||
54 | 2 | unsigned int number = 0; | |
55 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
56 | 2 | int16_t* iBufferPtr = iBuffer; | |
57 | 2 | int16_t* qBufferPtr = qBuffer; | |
58 | |||
59 | 2 | __m256i MoveMask = _mm256_set_epi8(15, | |
60 | 14, | ||
61 | 11, | ||
62 | 10, | ||
63 | 7, | ||
64 | 6, | ||
65 | 3, | ||
66 | 2, | ||
67 | 13, | ||
68 | 12, | ||
69 | 9, | ||
70 | 8, | ||
71 | 5, | ||
72 | 4, | ||
73 | 1, | ||
74 | 0, | ||
75 | 15, | ||
76 | 14, | ||
77 | 11, | ||
78 | 10, | ||
79 | 7, | ||
80 | 6, | ||
81 | 3, | ||
82 | 2, | ||
83 | 13, | ||
84 | 12, | ||
85 | 9, | ||
86 | 8, | ||
87 | 5, | ||
88 | 4, | ||
89 | 1, | ||
90 | 0); | ||
91 | |||
92 | __m256i iMove2, iMove1; | ||
93 | __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; | ||
94 | |||
95 | 2 | unsigned int sixteenthPoints = num_points / 16; | |
96 | |||
97 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
98 | 16382 | complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
99 | 16382 | complexVectorPtr += 32; | |
100 | 16382 | complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
101 | 16382 | complexVectorPtr += 32; | |
102 | |||
103 | 16382 | iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); | |
104 | 16382 | iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); | |
105 | |||
106 | 16382 | iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), | |
107 | _mm256_permute4x64_epi64(iMove2, 0x80), | ||
108 | 0x30); | ||
109 | 16382 | qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), | |
110 | _mm256_permute4x64_epi64(iMove2, 0xd0), | ||
111 | 0x30); | ||
112 | |||
113 | _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); | ||
114 | _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); | ||
115 | |||
116 | 16382 | iBufferPtr += 16; | |
117 | 16382 | qBufferPtr += 16; | |
118 | } | ||
119 | |||
120 | 2 | number = sixteenthPoints * 16; | |
121 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
122 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
123 | 30 | *iBufferPtr++ = *int16ComplexVectorPtr++; | |
124 | 30 | *qBufferPtr++ = *int16ComplexVectorPtr++; | |
125 | } | ||
126 | 2 | } | |
127 | #endif /* LV_HAVE_AVX2 */ | ||
128 | |||
129 | #ifdef LV_HAVE_SSSE3 | ||
130 | #include <tmmintrin.h> | ||
131 | |||
132 | 2 | static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, | |
133 | int16_t* qBuffer, | ||
134 | const lv_16sc_t* complexVector, | ||
135 | unsigned int num_points) | ||
136 | { | ||
137 | 2 | unsigned int number = 0; | |
138 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
139 | 2 | int16_t* iBufferPtr = iBuffer; | |
140 | 2 | int16_t* qBufferPtr = qBuffer; | |
141 | |||
142 | 2 | __m128i iMoveMask1 = _mm_set_epi8( | |
143 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); | ||
144 | 2 | __m128i iMoveMask2 = _mm_set_epi8( | |
145 | 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); | ||
146 | |||
147 | 2 | __m128i qMoveMask1 = _mm_set_epi8( | |
148 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); | ||
149 | 2 | __m128i qMoveMask2 = _mm_set_epi8( | |
150 | 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); | ||
151 | |||
152 | __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; | ||
153 | |||
154 | 2 | unsigned int eighthPoints = num_points / 8; | |
155 | |||
156 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < eighthPoints; number++) { |
157 | 32766 | complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); | |
158 | 32766 | complexVectorPtr += 16; | |
159 | 32766 | complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); | |
160 | 32766 | complexVectorPtr += 16; | |
161 | |||
162 | 98298 | iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1), | |
163 | _mm_shuffle_epi8(complexVal2, iMoveMask2)); | ||
164 | 98298 | qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1), | |
165 | _mm_shuffle_epi8(complexVal2, qMoveMask2)); | ||
166 | |||
167 | _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); | ||
168 | _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); | ||
169 | |||
170 | 32766 | iBufferPtr += 8; | |
171 | 32766 | qBufferPtr += 8; | |
172 | } | ||
173 | |||
174 | 2 | number = eighthPoints * 8; | |
175 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
176 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
177 | 14 | *iBufferPtr++ = *int16ComplexVectorPtr++; | |
178 | 14 | *qBufferPtr++ = *int16ComplexVectorPtr++; | |
179 | } | ||
180 | 2 | } | |
181 | #endif /* LV_HAVE_SSSE3 */ | ||
182 | |||
183 | #ifdef LV_HAVE_SSE2 | ||
184 | #include <emmintrin.h> | ||
185 | |||
186 | 2 | static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, | |
187 | int16_t* qBuffer, | ||
188 | const lv_16sc_t* complexVector, | ||
189 | unsigned int num_points) | ||
190 | { | ||
191 | 2 | unsigned int number = 0; | |
192 | 2 | const int16_t* complexVectorPtr = (int16_t*)complexVector; | |
193 | 2 | int16_t* iBufferPtr = iBuffer; | |
194 | 2 | int16_t* qBufferPtr = qBuffer; | |
195 | __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, | ||
196 | qComplexVal2, iOutputVal, qOutputVal; | ||
197 | 2 | __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); | |
198 | 2 | __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); | |
199 | |||
200 | 2 | unsigned int eighthPoints = num_points / 8; | |
201 | |||
202 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < eighthPoints; number++) { |
203 | 32766 | complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); | |
204 | 32766 | complexVectorPtr += 8; | |
205 | 32766 | complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); | |
206 | 32766 | complexVectorPtr += 8; | |
207 | |||
208 | 32766 | iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); | |
209 | |||
210 | 32766 | iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); | |
211 | |||
212 | 32766 | iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); | |
213 | |||
214 | 32766 | iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); | |
215 | |||
216 | 32766 | iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0)); | |
217 | |||
218 | 32766 | iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); | |
219 | |||
220 | 98298 | iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), | |
221 | _mm_and_si128(iComplexVal2, highMask)); | ||
222 | |||
223 | _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); | ||
224 | |||
225 | 32766 | qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1)); | |
226 | |||
227 | 32766 | qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1)); | |
228 | |||
229 | 32766 | qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); | |
230 | |||
231 | 32766 | qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); | |
232 | |||
233 | 32766 | qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); | |
234 | |||
235 | 32766 | qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); | |
236 | |||
237 | 98298 | qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), | |
238 | _mm_and_si128(qComplexVal2, highMask)); | ||
239 | |||
240 | _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); | ||
241 | |||
242 | 32766 | iBufferPtr += 8; | |
243 | 32766 | qBufferPtr += 8; | |
244 | } | ||
245 | |||
246 | 2 | number = eighthPoints * 8; | |
247 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
248 | 14 | *iBufferPtr++ = *complexVectorPtr++; | |
249 | 14 | *qBufferPtr++ = *complexVectorPtr++; | |
250 | } | ||
251 | 2 | } | |
252 | #endif /* LV_HAVE_SSE2 */ | ||
253 | |||
254 | #ifdef LV_HAVE_GENERIC | ||
255 | |||
256 | 2 | static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, | |
257 | int16_t* qBuffer, | ||
258 | const lv_16sc_t* complexVector, | ||
259 | unsigned int num_points) | ||
260 | { | ||
261 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
262 | 2 | int16_t* iBufferPtr = iBuffer; | |
263 | 2 | int16_t* qBufferPtr = qBuffer; | |
264 | unsigned int number; | ||
265 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
266 | 262142 | *iBufferPtr++ = *complexVectorPtr++; | |
267 | 262142 | *qBufferPtr++ = *complexVectorPtr++; | |
268 | } | ||
269 | 2 | } | |
270 | #endif /* LV_HAVE_GENERIC */ | ||
271 | |||
272 | #ifdef LV_HAVE_ORC | ||
273 | |||
274 | extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, | ||
275 | int16_t* qBuffer, | ||
276 | const lv_16sc_t* complexVector, | ||
277 | unsigned int num_points); | ||
278 | 2 | static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, | |
279 | int16_t* qBuffer, | ||
280 | const lv_16sc_t* complexVector, | ||
281 | unsigned int num_points) | ||
282 | { | ||
283 | 2 | volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); | |
284 | 2 | } | |
285 | #endif /* LV_HAVE_ORC */ | ||
286 | |||
287 | #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */ | ||
288 | |||
289 | |||
290 | #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H | ||
291 | #define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H | ||
292 | |||
293 | #include <inttypes.h> | ||
294 | #include <stdio.h> | ||
295 | #ifdef LV_HAVE_AVX2 | ||
296 | #include <immintrin.h> | ||
297 | |||
298 | 2 | static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, | |
299 | int16_t* qBuffer, | ||
300 | const lv_16sc_t* complexVector, | ||
301 | unsigned int num_points) | ||
302 | { | ||
303 | 2 | unsigned int number = 0; | |
304 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
305 | 2 | int16_t* iBufferPtr = iBuffer; | |
306 | 2 | int16_t* qBufferPtr = qBuffer; | |
307 | |||
308 | 2 | __m256i MoveMask = _mm256_set_epi8(15, | |
309 | 14, | ||
310 | 11, | ||
311 | 10, | ||
312 | 7, | ||
313 | 6, | ||
314 | 3, | ||
315 | 2, | ||
316 | 13, | ||
317 | 12, | ||
318 | 9, | ||
319 | 8, | ||
320 | 5, | ||
321 | 4, | ||
322 | 1, | ||
323 | 0, | ||
324 | 15, | ||
325 | 14, | ||
326 | 11, | ||
327 | 10, | ||
328 | 7, | ||
329 | 6, | ||
330 | 3, | ||
331 | 2, | ||
332 | 13, | ||
333 | 12, | ||
334 | 9, | ||
335 | 8, | ||
336 | 5, | ||
337 | 4, | ||
338 | 1, | ||
339 | 0); | ||
340 | |||
341 | __m256i iMove2, iMove1; | ||
342 | __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; | ||
343 | |||
344 | 2 | unsigned int sixteenthPoints = num_points / 16; | |
345 | |||
346 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
347 | 16382 | complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
348 | 16382 | complexVectorPtr += 32; | |
349 | 16382 | complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
350 | 16382 | complexVectorPtr += 32; | |
351 | |||
352 | 16382 | iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); | |
353 | 16382 | iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); | |
354 | |||
355 | 16382 | iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), | |
356 | _mm256_permute4x64_epi64(iMove2, 0x80), | ||
357 | 0x30); | ||
358 | 16382 | qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), | |
359 | _mm256_permute4x64_epi64(iMove2, 0xd0), | ||
360 | 0x30); | ||
361 | |||
362 | _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); | ||
363 | _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); | ||
364 | |||
365 | 16382 | iBufferPtr += 16; | |
366 | 16382 | qBufferPtr += 16; | |
367 | } | ||
368 | |||
369 | 2 | number = sixteenthPoints * 16; | |
370 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
371 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
372 | 30 | *iBufferPtr++ = *int16ComplexVectorPtr++; | |
373 | 30 | *qBufferPtr++ = *int16ComplexVectorPtr++; | |
374 | } | ||
375 | 2 | } | |
376 | #endif /* LV_HAVE_AVX2 */ | ||
377 | |||
378 | #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */ | ||
379 |