Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8ic_deinterleave_16i_x2 | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the complex 8-bit char vector into I & Q vector data | ||
16 | * and converts them to 16-bit shorts. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* | ||
21 | * complexVector, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li complexVector: The complex input vector. | ||
25 | * \li num_points: The number of complex data values to be deinterleaved. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li iBuffer: The I buffer output data. | ||
29 | * \li qBuffer: The Q buffer output data. | ||
30 | * | ||
31 | * \b Example | ||
32 | * \code | ||
33 | * int N = 10000; | ||
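34 | * // iBuffer, qBuffer (N int16_t each) and complexVector (N lv_8sc_t) are buffers from volk_malloc | ||
35 | * volk_8ic_deinterleave_16i_x2(iBuffer, qBuffer, complexVector, N); | ||
36 | * | ||
37 | * volk_free(iBuffer); volk_free(qBuffer); volk_free(complexVector); | ||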
38 | * \endcode | ||
39 | */ | ||
40 | |||
41 | #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H | ||
42 | #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H | ||
43 | |||
44 | #include <inttypes.h> | ||
45 | #include <stdio.h> | ||
46 | |||
47 | #ifdef LV_HAVE_AVX2 | ||
48 | #include <immintrin.h> | ||
49 | |||
50 | 2 | static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, /* out: receives the even-indexed input bytes, widened to 16 bits and shifted left by 8 */ | |
51 | int16_t* qBuffer, /* out: receives the odd-indexed input bytes, widened to 16 bits and shifted left by 8 */ | ||
52 | const lv_8sc_t* complexVector, /* in: interleaved samples, read below as a flat int8_t stream */ | ||
53 | unsigned int num_points) /* number of byte pairs to process */ | ||
54 | { /* AVX2 kernel with aligned 256-bit loads and stores: 16 byte pairs per vector iteration, scalar loop for the remainder. The aligned intrinsics need 32-byte aligned addresses. */ | ||
55 | 2 | unsigned int number = 0; | |
56 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
57 | 2 | int16_t* iBufferPtr = iBuffer; | |
58 | 2 | int16_t* qBufferPtr = qBuffer; | |
59 | 2 | __m256i MoveMask = _mm256_set_epi8(15, /* arguments run from byte 31 down to byte 0; within each 128-bit lane the even source bytes land in the low 8 bytes and the odd source bytes in the high 8 bytes */ | |
60 | 13, | ||
61 | 11, | ||
62 | 9, | ||
63 | 7, | ||
64 | 5, | ||
65 | 3, | ||
66 | 1, | ||
67 | 14, | ||
68 | 12, | ||
69 | 10, | ||
70 | 8, | ||
71 | 6, | ||
72 | 4, | ||
73 | 2, | ||
74 | 0, | ||
75 | 15, | ||
76 | 13, | ||
77 | 11, | ||
78 | 9, | ||
79 | 7, | ||
80 | 5, | ||
81 | 3, | ||
82 | 1, | ||
83 | 14, | ||
84 | 12, | ||
85 | 10, | ||
86 | 8, | ||
87 | 6, | ||
88 | 4, | ||
89 | 2, | ||
90 | 0); | ||
91 | __m256i complexVal, iOutputVal, qOutputVal; | ||
92 | __m128i iOutputVal0, qOutputVal0; | ||
93 | |||
94 | 2 | unsigned int sixteenthPoints = num_points / 16; /* count of full 16-pair (32-byte) vector iterations */ | |
95 | |||
96 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
97 | 16382 | complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); /* aligned load of 32 interleaved bytes */ | |
98 | 16382 | complexVectorPtr += 32; | |
99 | |||
100 | 16382 | complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); /* each lane now holds its 8 even bytes followed by its 8 odd bytes */ | |
101 | 16382 | complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); /* 0xd8 selects 64-bit quarters 0,2,1,3: low half = 16 even bytes, high half = 16 odd bytes */ | |
102 | |||
103 | 16382 | iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); /* the 16 bytes destined for iBuffer */ | |
104 | 16382 | qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); /* the 16 bytes destined for qBuffer */ | |
105 | |||
106 | 16382 | iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); /* sign-extend 16 bytes to 16 int16 values */ | |
107 | 16382 | iOutputVal = _mm256_slli_epi16(iOutputVal, 8); /* shift left by 8, matching the *256 in the scalar tail */ | |
108 | |||
109 | 16382 | qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); | |
110 | 16382 | qOutputVal = _mm256_slli_epi16(qOutputVal, 8); | |
111 | |||
112 | _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); /* aligned store of 16 int16 values */ | ||
113 | _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); | ||
114 | |||
115 | 16382 | iBufferPtr += 16; | |
116 | 16382 | qBufferPtr += 16; | |
117 | } | ||
118 | |||
119 | 2 | number = sixteenthPoints * 16; /* first pair not handled by the vector loop; the pointers already sit there */ | |
120 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
121 | 30 | *iBufferPtr++ = | |
122 | 30 | ((int16_t)*complexVectorPtr++) * | |
123 | 256; // scalar tail: widen the 8-bit sample to 16 bits and multiply by 256 (same result as the vector shift by 8) | ||
124 | 30 | *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; | |
125 | } | ||
126 | 2 | } | |
127 | #endif /* LV_HAVE_AVX2 */ | ||
128 | |||
129 | #ifdef LV_HAVE_SSE4_1 | ||
130 | #include <smmintrin.h> | ||
131 | |||
132 | 2 | static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, /* out: receives the even-indexed input bytes, widened to 16 bits and shifted left by 8 */ | |
133 | int16_t* qBuffer, /* out: receives the odd-indexed input bytes, widened to 16 bits and shifted left by 8 */ | ||
134 | const lv_8sc_t* complexVector, /* in: interleaved samples, read below as a flat int8_t stream */ | ||
135 | unsigned int num_points) /* number of byte pairs to process */ | ||
136 | { /* SSE4.1 kernel with aligned 128-bit loads and stores: 8 byte pairs per vector iteration, scalar loop for the remainder. The aligned intrinsics need 16-byte aligned addresses. */ | ||
137 | 2 | unsigned int number = 0; | |
138 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
139 | 2 | int16_t* iBufferPtr = iBuffer; | |
140 | 2 | int16_t* qBufferPtr = qBuffer; | |
141 | 2 | __m128i iMoveMask = _mm_set_epi8(0x80, /* arguments run from byte 15 down to byte 0; 0x80 makes the shuffle write zero to that byte */ | |
142 | 0x80, | ||
143 | 0x80, | ||
144 | 0x80, | ||
145 | 0x80, | ||
146 | 0x80, | ||
147 | 0x80, | ||
148 | 0x80, | ||
149 | 14, | ||
150 | 12, | ||
151 | 10, | ||
152 | 8, | ||
153 | 6, | ||
154 | 4, | ||
155 | 2, | ||
156 | 0); // low 8 bytes select the even source bytes 0,2,...,14 | ||
157 | 2 | __m128i qMoveMask = _mm_set_epi8( /* same layout, selecting the odd source bytes 1,3,...,15 */ | |
158 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); | ||
159 | __m128i complexVal, iOutputVal, qOutputVal; | ||
160 | |||
161 | 2 | unsigned int eighthPoints = num_points / 8; /* count of full 8-pair (16-byte) vector iterations */ | |
162 | |||
163 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < eighthPoints; number++) { |
164 | 32766 | complexVal = _mm_load_si128((__m128i*)complexVectorPtr); /* aligned load of 16 interleaved bytes */ | |
165 | 32766 | complexVectorPtr += 16; // advance past the 16 bytes (8 pairs) just loaded | |
166 | |||
167 | 32766 | iOutputVal = _mm_shuffle_epi8(complexVal, | |
168 | iMoveMask); // gather the 8 even bytes into the low half, zero the high half | ||
169 | 32766 | qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); /* same for the 8 odd bytes */ | |
170 | |||
171 | 32766 | iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // sign-extend the low 8 bytes | |
172 | // into eight 16-bit integers | ||
173 | iOutputVal = | ||
174 | 32766 | _mm_slli_epi16(iOutputVal, 8); // shift each of the eight 16-bit integers left | |
175 | // by 8 bits, filling with zeros | ||
176 | |||
177 | 32766 | qOutputVal = _mm_cvtepi8_epi16(qOutputVal); | |
178 | 32766 | qOutputVal = _mm_slli_epi16(qOutputVal, 8); | |
179 | |||
180 | _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store of 8 int16 values | ||
181 | _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); | ||
182 | |||
183 | 32766 | iBufferPtr += 8; | |
184 | 32766 | qBufferPtr += 8; | |
185 | } | ||
186 | |||
187 | 2 | number = eighthPoints * 8; /* first pair not handled by the vector loop; the pointers already sit there */ | |
188 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
189 | 14 | *iBufferPtr++ = | |
190 | 14 | ((int16_t)*complexVectorPtr++) * | |
191 | 256; // scalar tail: widen the 8-bit sample to 16 bits and multiply by 256 (same result as the vector shift by 8) | ||
192 | 14 | *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; | |
193 | } | ||
194 | 2 | } | |
195 | #endif /* LV_HAVE_SSE4_1 */ | ||
196 | |||
197 | |||
198 | #ifdef LV_HAVE_AVX | ||
199 | #include <immintrin.h> | ||
200 | |||
201 | 2 | static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, /* out: receives the even-indexed input bytes, widened to 16 bits and shifted left by 8 */ | |
202 | int16_t* qBuffer, /* out: receives the odd-indexed input bytes, widened to 16 bits and shifted left by 8 */ | ||
203 | const lv_8sc_t* complexVector, /* in: interleaved samples, read below as a flat int8_t stream */ | ||
204 | unsigned int num_points) /* number of byte pairs to process */ | ||
205 | { /* AVX kernel: aligned 256-bit loads and stores, with the integer work done on the two 128-bit halves separately. 16 byte pairs per vector iteration, scalar loop for the remainder. The aligned intrinsics need 32-byte aligned addresses. */ | ||
206 | 2 | unsigned int number = 0; | |
207 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
208 | 2 | int16_t* iBufferPtr = iBuffer; | |
209 | 2 | int16_t* qBufferPtr = qBuffer; | |
210 | 2 | __m128i iMoveMask = _mm_set_epi8(0x80, /* arguments run from byte 15 down to byte 0; 0x80 makes the shuffle write zero to that byte */ | |
211 | 0x80, | ||
212 | 0x80, | ||
213 | 0x80, | ||
214 | 0x80, | ||
215 | 0x80, | ||
216 | 0x80, | ||
217 | 0x80, | ||
218 | 14, | ||
219 | 12, | ||
220 | 10, | ||
221 | 8, | ||
222 | 6, | ||
223 | 4, | ||
224 | 2, | ||
225 | 0); // low 8 bytes select the even source bytes 0,2,...,14 | ||
226 | 2 | __m128i qMoveMask = _mm_set_epi8( /* same layout, selecting the odd source bytes 1,3,...,15 */ | |
227 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); | ||
228 | __m256i complexVal, iOutputVal, qOutputVal; | ||
229 | __m128i complexVal1, complexVal0; | ||
230 | __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; | ||
231 | |||
232 | 2 | unsigned int sixteenthPoints = num_points / 16; /* count of full 16-pair (32-byte) vector iterations */ | |
233 | |||
234 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
235 | 16382 | complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); /* aligned load of 32 interleaved bytes */ | |
236 | 16382 | complexVectorPtr += 32; // advance past the 32 bytes (16 pairs) just loaded | |
237 | |||
238 | // Split complexVal into its upper (1) and lower (0) 128-bit halves | ||
239 | 16382 | complexVal1 = _mm256_extractf128_si256(complexVal, 1); | |
240 | 16382 | complexVal0 = _mm256_extractf128_si256(complexVal, 0); | |
241 | |||
242 | 16382 | iOutputVal1 = _mm_shuffle_epi8( | |
243 | complexVal1, iMoveMask); // gather the 8 even bytes into the low half, zero the high half | ||
244 | 16382 | iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); | |
245 | 16382 | qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); /* same for the 8 odd bytes */ | |
246 | 16382 | qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); | |
247 | |||
248 | iOutputVal1 = | ||
249 | 16382 | _mm_cvtepi8_epi16(iOutputVal1); // sign-extend the low 8 bytes into | |
250 | // eight 16-bit integers | ||
251 | iOutputVal1 = | ||
252 | 16382 | _mm_slli_epi16(iOutputVal1, 8); // shift each of the eight 16-bit integers left | |
253 | // by 8 bits, filling with zeros | ||
254 | 16382 | iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); | |
255 | 16382 | iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); | |
256 | |||
257 | 16382 | qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); | |
258 | 16382 | qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); | |
259 | 16382 | qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); | |
260 | 16382 | qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); | |
261 | |||
262 | // Recombine the two 128-bit results into one 256-bit vector per output (half 0 low, half 1 high) | ||
263 | 16382 | __m256i dummy = _mm256_setzero_si256(); /* zero base vector; both of its halves are overwritten below */ | |
264 | 16382 | iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); | |
265 | 16382 | iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); | |
266 | 16382 | qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); | |
267 | 16382 | qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); | |
268 | |||
269 | _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store of 16 int16 values | ||
270 | _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); | ||
271 | |||
272 | 16382 | iBufferPtr += 16; | |
273 | 16382 | qBufferPtr += 16; | |
274 | } | ||
275 | |||
276 | 2 | number = sixteenthPoints * 16; /* first pair not handled by the vector loop; the pointers already sit there */ | |
277 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
278 | 30 | *iBufferPtr++ = | |
279 | 30 | ((int16_t)*complexVectorPtr++) * | |
280 | 256; // scalar tail: widen the 8-bit sample to 16 bits and multiply by 256 (same result as the vector shift by 8) | ||
281 | 30 | *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; | |
282 | } | ||
283 | 2 | } | |
284 | #endif /* LV_HAVE_AVX */ | ||
285 | |||
286 | |||
287 | #ifdef LV_HAVE_GENERIC | ||
288 | |||
289 | 2 | static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, /* out: receives the even-indexed input bytes, widened to 16 bits and multiplied by 256 */ | |
290 | int16_t* qBuffer, /* out: receives the odd-indexed input bytes, widened to 16 bits and multiplied by 256 */ | ||
291 | const lv_8sc_t* complexVector, /* in: interleaved samples, read below as a flat int8_t stream */ | ||
292 | unsigned int num_points) /* number of byte pairs to process */ | ||
293 | { /* Portable scalar kernel: one byte pair per iteration, no SIMD intrinsics. */ | ||
294 | 2 | const int8_t* complexVectorPtr = (const int8_t*)complexVector; | |
295 | 2 | int16_t* iBufferPtr = iBuffer; | |
296 | 2 | int16_t* qBufferPtr = qBuffer; | |
297 | unsigned int number; | ||
298 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
299 | 262142 | *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; /* first byte of the pair goes to iBuffer */ | |
300 | 262142 | *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; /* second byte of the pair goes to qBuffer */ | |
301 | } | ||
302 | 2 | } | |
303 | #endif /* LV_HAVE_GENERIC */ | ||
304 | |||
305 | |||
306 | #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ | ||
307 | |||
308 | #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H | ||
309 | #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H | ||
310 | |||
311 | #include <inttypes.h> | ||
312 | #include <stdio.h> | ||
313 | |||
314 | #ifdef LV_HAVE_AVX2 | ||
315 | #include <immintrin.h> | ||
316 | |||
317 | 2 | static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, /* out: receives the even-indexed input bytes, widened to 16 bits and shifted left by 8 */ | |
318 | int16_t* qBuffer, /* out: receives the odd-indexed input bytes, widened to 16 bits and shifted left by 8 */ | ||
319 | const lv_8sc_t* complexVector, /* in: interleaved samples, read below as a flat int8_t stream */ | ||
320 | unsigned int num_points) /* number of byte pairs to process */ | ||
321 | { /* AVX2 kernel with unaligned 256-bit loads and stores (loadu/storeu): 16 byte pairs per vector iteration, scalar loop for the remainder. */ | ||
322 | 2 | unsigned int number = 0; | |
323 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
324 | 2 | int16_t* iBufferPtr = iBuffer; | |
325 | 2 | int16_t* qBufferPtr = qBuffer; | |
326 | 2 | __m256i MoveMask = _mm256_set_epi8(15, /* arguments run from byte 31 down to byte 0; within each 128-bit lane the even source bytes land in the low 8 bytes and the odd source bytes in the high 8 bytes */ | |
327 | 13, | ||
328 | 11, | ||
329 | 9, | ||
330 | 7, | ||
331 | 5, | ||
332 | 3, | ||
333 | 1, | ||
334 | 14, | ||
335 | 12, | ||
336 | 10, | ||
337 | 8, | ||
338 | 6, | ||
339 | 4, | ||
340 | 2, | ||
341 | 0, | ||
342 | 15, | ||
343 | 13, | ||
344 | 11, | ||
345 | 9, | ||
346 | 7, | ||
347 | 5, | ||
348 | 3, | ||
349 | 1, | ||
350 | 14, | ||
351 | 12, | ||
352 | 10, | ||
353 | 8, | ||
354 | 6, | ||
355 | 4, | ||
356 | 2, | ||
357 | 0); | ||
358 | __m256i complexVal, iOutputVal, qOutputVal; | ||
359 | __m128i iOutputVal0, qOutputVal0; | ||
360 | |||
361 | 2 | unsigned int sixteenthPoints = num_points / 16; /* count of full 16-pair (32-byte) vector iterations */ | |
362 | |||
363 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
364 | 16382 | complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); /* unaligned load of 32 interleaved bytes */ | |
365 | 16382 | complexVectorPtr += 32; | |
366 | |||
367 | 16382 | complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); /* each lane now holds its 8 even bytes followed by its 8 odd bytes */ | |
368 | 16382 | complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); /* 0xd8 selects 64-bit quarters 0,2,1,3: low half = 16 even bytes, high half = 16 odd bytes */ | |
369 | |||
370 | 16382 | iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); /* the 16 bytes destined for iBuffer */ | |
371 | 16382 | qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); /* the 16 bytes destined for qBuffer */ | |
372 | |||
373 | 16382 | iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); /* sign-extend 16 bytes to 16 int16 values */ | |
374 | 16382 | iOutputVal = _mm256_slli_epi16(iOutputVal, 8); /* shift left by 8, matching the *256 in the scalar tail */ | |
375 | |||
376 | 16382 | qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); | |
377 | 16382 | qOutputVal = _mm256_slli_epi16(qOutputVal, 8); | |
378 | |||
379 | _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); /* unaligned store of 16 int16 values */ | ||
380 | _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); | ||
381 | |||
382 | 16382 | iBufferPtr += 16; | |
383 | 16382 | qBufferPtr += 16; | |
384 | } | ||
385 | |||
386 | 2 | number = sixteenthPoints * 16; /* first pair not handled by the vector loop; the pointers already sit there */ | |
387 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
388 | 30 | *iBufferPtr++ = | |
389 | 30 | ((int16_t)*complexVectorPtr++) * | |
390 | 256; // scalar tail: widen the 8-bit sample to 16 bits and multiply by 256 (same result as the vector shift by 8) | ||
391 | 30 | *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; | |
392 | } | ||
393 | 2 | } | |
394 | #endif /* LV_HAVE_AVX2 */ | ||
395 | #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ | ||
396 |