Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_deinterleave_real_8i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the 16-bit complex vector and returns the real | ||
16 | * (in-phase) part of the signal as an 8-bit value (the upper byte of each 16-bit sample). | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, | ||
21 | * unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li complexVector: The complex input vector. | ||
25 | * \li num_points: The number of complex data values to be deinterleaved. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li iBuffer: The I buffer output data with 8-bit precision. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
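32 | * int N = 10000; | ||
33 | * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, volk_get_alignment()); | ||
34 | * int8_t* t = (int8_t*)volk_malloc(sizeof(int8_t) * N, volk_get_alignment()); | ||
35 | * volk_16ic_deinterleave_real_8i(t, x, N); // call after filling x with N complex samples | ||
36 | * volk_free(x); | ||
37 | * volk_free(t); | ||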
38 | * \endcode | ||
39 | */ | ||
40 | |||
41 | #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H | ||
42 | #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H | ||
43 | |||
44 | #include <inttypes.h> | ||
45 | #include <stdio.h> | ||
46 | |||
47 | |||
48 | #ifdef LV_HAVE_AVX2 | ||
49 | #include <immintrin.h> | ||
50 | |||
51 | 2 | static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, | |
52 | const lv_16sc_t* complexVector, | ||
53 | unsigned int num_points) | ||
54 | { | ||
55 | 2 | unsigned int number = 0; | |
56 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
57 | 2 | int8_t* iBufferPtr = iBuffer; | |
58 | 2 | __m256i iMoveMask1 = _mm256_set_epi8(0x80, | |
59 | 0x80, | ||
60 | 0x80, | ||
61 | 0x80, | ||
62 | 0x80, | ||
63 | 0x80, | ||
64 | 0x80, | ||
65 | 0x80, | ||
66 | 13, | ||
67 | 12, | ||
68 | 9, | ||
69 | 8, | ||
70 | 5, | ||
71 | 4, | ||
72 | 1, | ||
73 | 0, | ||
74 | 0x80, | ||
75 | 0x80, | ||
76 | 0x80, | ||
77 | 0x80, | ||
78 | 0x80, | ||
79 | 0x80, | ||
80 | 0x80, | ||
81 | 0x80, | ||
82 | 13, | ||
83 | 12, | ||
84 | 9, | ||
85 | 8, | ||
86 | 5, | ||
87 | 4, | ||
88 | 1, | ||
89 | 0); | ||
90 | 2 | __m256i iMoveMask2 = _mm256_set_epi8(13, | |
91 | 12, | ||
92 | 9, | ||
93 | 8, | ||
94 | 5, | ||
95 | 4, | ||
96 | 1, | ||
97 | 0, | ||
98 | 0x80, | ||
99 | 0x80, | ||
100 | 0x80, | ||
101 | 0x80, | ||
102 | 0x80, | ||
103 | 0x80, | ||
104 | 0x80, | ||
105 | 0x80, | ||
106 | 13, | ||
107 | 12, | ||
108 | 9, | ||
109 | 8, | ||
110 | 5, | ||
111 | 4, | ||
112 | 1, | ||
113 | 0, | ||
114 | 0x80, | ||
115 | 0x80, | ||
116 | 0x80, | ||
117 | 0x80, | ||
118 | 0x80, | ||
119 | 0x80, | ||
120 | 0x80, | ||
121 | 0x80); | ||
122 | __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; | ||
123 | |||
124 | 2 | unsigned int thirtysecondPoints = num_points / 32; | |
125 | |||
126 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (number = 0; number < thirtysecondPoints; number++) { |
127 | 8190 | complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
128 | 8190 | complexVectorPtr += 32; | |
129 | 8190 | complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
130 | 8190 | complexVectorPtr += 32; | |
131 | |||
132 | 8190 | complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
133 | 8190 | complexVectorPtr += 32; | |
134 | 8190 | complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); | |
135 | 8190 | complexVectorPtr += 32; | |
136 | |||
137 | 8190 | complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); | |
138 | 8190 | complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); | |
139 | |||
140 | 8190 | complexVal1 = _mm256_or_si256(complexVal1, complexVal2); | |
141 | 8190 | complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); | |
142 | |||
143 | 8190 | complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); | |
144 | 8190 | complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); | |
145 | |||
146 | 8190 | complexVal3 = _mm256_or_si256(complexVal3, complexVal4); | |
147 | 8190 | complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); | |
148 | |||
149 | 8190 | complexVal1 = _mm256_srai_epi16(complexVal1, 8); | |
150 | 8190 | complexVal3 = _mm256_srai_epi16(complexVal3, 8); | |
151 | |||
152 | 8190 | iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); | |
153 | 8190 | iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); | |
154 | |||
155 | _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); | ||
156 | |||
157 | 8190 | iBufferPtr += 32; | |
158 | } | ||
159 | |||
160 | 2 | number = thirtysecondPoints * 32; | |
161 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
162 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
163 | 62 | *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); | |
164 | 62 | int16ComplexVectorPtr++; | |
165 | } | ||
166 | 2 | } | |
167 | #endif /* LV_HAVE_AVX2 */ | ||
168 | |||
169 | |||
170 | #ifdef LV_HAVE_SSSE3 | ||
171 | #include <tmmintrin.h> | ||
172 | |||
173 | 2 | static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, | |
174 | const lv_16sc_t* complexVector, | ||
175 | unsigned int num_points) | ||
176 | { | ||
177 | 2 | unsigned int number = 0; | |
178 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
179 | 2 | int8_t* iBufferPtr = iBuffer; | |
180 | 2 | __m128i iMoveMask1 = _mm_set_epi8( | |
181 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); | ||
182 | 2 | __m128i iMoveMask2 = _mm_set_epi8( | |
183 | 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); | ||
184 | __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; | ||
185 | |||
186 | 2 | unsigned int sixteenthPoints = num_points / 16; | |
187 | |||
188 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (number = 0; number < sixteenthPoints; number++) { |
189 | 16382 | complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); | |
190 | 16382 | complexVectorPtr += 16; | |
191 | 16382 | complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); | |
192 | 16382 | complexVectorPtr += 16; | |
193 | |||
194 | 16382 | complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); | |
195 | 16382 | complexVectorPtr += 16; | |
196 | 16382 | complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); | |
197 | 16382 | complexVectorPtr += 16; | |
198 | |||
199 | 16382 | complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); | |
200 | 16382 | complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); | |
201 | |||
202 | 16382 | complexVal1 = _mm_or_si128(complexVal1, complexVal2); | |
203 | |||
204 | 16382 | complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); | |
205 | 16382 | complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); | |
206 | |||
207 | 16382 | complexVal3 = _mm_or_si128(complexVal3, complexVal4); | |
208 | |||
209 | |||
210 | 16382 | complexVal1 = _mm_srai_epi16(complexVal1, 8); | |
211 | 16382 | complexVal3 = _mm_srai_epi16(complexVal3, 8); | |
212 | |||
213 | 16382 | iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); | |
214 | |||
215 | _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); | ||
216 | |||
217 | 16382 | iBufferPtr += 16; | |
218 | } | ||
219 | |||
220 | 2 | number = sixteenthPoints * 16; | |
221 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
222 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
223 | 30 | *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); | |
224 | 30 | int16ComplexVectorPtr++; | |
225 | } | ||
226 | 2 | } | |
227 | #endif /* LV_HAVE_SSSE3 */ | ||
228 | |||
229 | #ifdef LV_HAVE_GENERIC | ||
230 | |||
231 | 2 | static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, | |
232 | const lv_16sc_t* complexVector, | ||
233 | unsigned int num_points) | ||
234 | { | ||
235 | 2 | unsigned int number = 0; | |
236 | 2 | int16_t* complexVectorPtr = (int16_t*)complexVector; | |
237 | 2 | int8_t* iBufferPtr = iBuffer; | |
238 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
239 | 262142 | *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); | |
240 | 262142 | complexVectorPtr++; | |
241 | } | ||
242 | 2 | } | |
243 | #endif /* LV_HAVE_GENERIC */ | ||
244 | |||
245 | #ifdef LV_HAVE_NEON | ||
246 | #include <arm_neon.h> | ||
247 | |||
/*!
 * \brief NEON kernel: stores the upper byte of every real (in-phase) 16-bit
 *        sample of complexVector into iBuffer.
 *
 * Eight complex points are processed per vector iteration; the remaining
 * num_points % 8 points are handled by a scalar loop.
 *
 * \param iBuffer       Output buffer receiving num_points 8-bit values.
 * \param complexVector Input buffer of num_points interleaved 16-bit
 *                      (real, imaginary) pairs; it is only read.
 * \param num_points    Number of complex points to process.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const int16_t* in = (const int16_t*)complexVector;
    int8_t* out = iBuffer;
    const unsigned int vector_iters = num_points / 8;
    unsigned int remaining = num_points - vector_iters * 8;
    unsigned int iter;

    for (iter = 0; iter < vector_iters; iter++) {
        /* De-interleaving load: val[0] receives the eight real words,
         * val[1] the eight imaginary words (unused here). */
        const int16x8x2_t iq = vld2q_s16(in);
        /* Shift right by 8 and narrow to 8 bits: keeps each word's high byte. */
        const int8x8_t real_high = vshrn_n_s16(iq.val[0], 8);
        vst1_s8(out, real_high);
        in += 16;
        out += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    for (; remaining > 0; remaining--) {
        *out++ = (int8_t)(in[0] >> 8);
        in += 2; /* step over the real and the imaginary word */
    }
}
272 | #endif | ||
273 | |||
274 | #ifdef LV_HAVE_ORC | ||
275 | |||
276 | extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, | ||
277 | const lv_16sc_t* complexVector, | ||
278 | unsigned int num_points); | ||
279 | |||
280 | 2 | static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, | |
281 | const lv_16sc_t* complexVector, | ||
282 | unsigned int num_points) | ||
283 | { | ||
284 | 2 | volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); | |
285 | 2 | } | |
286 | #endif /* LV_HAVE_ORC */ | ||
287 | |||
288 | |||
289 | #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */ | ||
290 | |||
291 | #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H | ||
292 | #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H | ||
293 | |||
294 | #include <inttypes.h> | ||
295 | #include <stdio.h> | ||
296 | |||
297 | |||
298 | #ifdef LV_HAVE_AVX2 | ||
299 | #include <immintrin.h> | ||
300 | |||
301 | 2 | static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, | |
302 | const lv_16sc_t* complexVector, | ||
303 | unsigned int num_points) | ||
304 | { | ||
305 | 2 | unsigned int number = 0; | |
306 | 2 | const int8_t* complexVectorPtr = (int8_t*)complexVector; | |
307 | 2 | int8_t* iBufferPtr = iBuffer; | |
308 | 2 | __m256i iMoveMask1 = _mm256_set_epi8(0x80, | |
309 | 0x80, | ||
310 | 0x80, | ||
311 | 0x80, | ||
312 | 0x80, | ||
313 | 0x80, | ||
314 | 0x80, | ||
315 | 0x80, | ||
316 | 13, | ||
317 | 12, | ||
318 | 9, | ||
319 | 8, | ||
320 | 5, | ||
321 | 4, | ||
322 | 1, | ||
323 | 0, | ||
324 | 0x80, | ||
325 | 0x80, | ||
326 | 0x80, | ||
327 | 0x80, | ||
328 | 0x80, | ||
329 | 0x80, | ||
330 | 0x80, | ||
331 | 0x80, | ||
332 | 13, | ||
333 | 12, | ||
334 | 9, | ||
335 | 8, | ||
336 | 5, | ||
337 | 4, | ||
338 | 1, | ||
339 | 0); | ||
340 | 2 | __m256i iMoveMask2 = _mm256_set_epi8(13, | |
341 | 12, | ||
342 | 9, | ||
343 | 8, | ||
344 | 5, | ||
345 | 4, | ||
346 | 1, | ||
347 | 0, | ||
348 | 0x80, | ||
349 | 0x80, | ||
350 | 0x80, | ||
351 | 0x80, | ||
352 | 0x80, | ||
353 | 0x80, | ||
354 | 0x80, | ||
355 | 0x80, | ||
356 | 13, | ||
357 | 12, | ||
358 | 9, | ||
359 | 8, | ||
360 | 5, | ||
361 | 4, | ||
362 | 1, | ||
363 | 0, | ||
364 | 0x80, | ||
365 | 0x80, | ||
366 | 0x80, | ||
367 | 0x80, | ||
368 | 0x80, | ||
369 | 0x80, | ||
370 | 0x80, | ||
371 | 0x80); | ||
372 | __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; | ||
373 | |||
374 | 2 | unsigned int thirtysecondPoints = num_points / 32; | |
375 | |||
376 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (number = 0; number < thirtysecondPoints; number++) { |
377 | 8190 | complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
378 | 8190 | complexVectorPtr += 32; | |
379 | 8190 | complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
380 | 8190 | complexVectorPtr += 32; | |
381 | |||
382 | 8190 | complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
383 | 8190 | complexVectorPtr += 32; | |
384 | 8190 | complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
385 | 8190 | complexVectorPtr += 32; | |
386 | |||
387 | 8190 | complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); | |
388 | 8190 | complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); | |
389 | |||
390 | 8190 | complexVal1 = _mm256_or_si256(complexVal1, complexVal2); | |
391 | 8190 | complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); | |
392 | |||
393 | 8190 | complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); | |
394 | 8190 | complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); | |
395 | |||
396 | 8190 | complexVal3 = _mm256_or_si256(complexVal3, complexVal4); | |
397 | 8190 | complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); | |
398 | |||
399 | 8190 | complexVal1 = _mm256_srai_epi16(complexVal1, 8); | |
400 | 8190 | complexVal3 = _mm256_srai_epi16(complexVal3, 8); | |
401 | |||
402 | 8190 | iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); | |
403 | 8190 | iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); | |
404 | |||
405 | _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); | ||
406 | |||
407 | 8190 | iBufferPtr += 32; | |
408 | } | ||
409 | |||
410 | 2 | number = thirtysecondPoints * 32; | |
411 | 2 | int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; | |
412 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
413 | 62 | *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); | |
414 | 62 | int16ComplexVectorPtr++; | |
415 | } | ||
416 | 2 | } | |
417 | #endif /* LV_HAVE_AVX2 */ | ||
418 | #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ | ||
419 |