Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8ic_s32f_deinterleave_32f_x2 | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the complex 8-bit char vector into I & Q vector data, | ||
16 | * converts them to floats, and divides the results by the scalar | ||
17 | * factor. | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* | ||
22 | * complexVector, const float scalar, unsigned int num_points) \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li complexVector: The complex input vector. | ||
26 | * \li scalar: The scalar value used to divide the floating point results. | ||
27 | * \li num_points: The number of complex data values to be deinterleaved. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li iBuffer: The I buffer output data. | ||
31 | * \li qBuffer: The Q buffer output data. | ||
32 | * | ||
33 | * \b Example | ||
34 | * \code | ||
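35 | * unsigned int N = 10000; | ||
36 | * // in: N interleaved 8-bit I/Q samples; iOut, qOut: N floats each (e.g. from volk_malloc) | ||
37 | * volk_8ic_s32f_deinterleave_32f_x2(iOut, qOut, in, 128.f, N); | ||
38 | * // now iOut[k] = I(in[k]) / 128.f and qOut[k] = Q(in[k]) / 128.f | ||
39 | * volk_free(in); volk_free(iOut); volk_free(qOut); | ||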
40 | * \endcode | ||
41 | */ | ||
42 | |||
43 | #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H | ||
44 | #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H | ||
45 | |||
46 | #include <inttypes.h> | ||
47 | #include <stdio.h> | ||
48 | #include <volk/volk_common.h> | ||
49 | |||
50 | |||
51 | #ifdef LV_HAVE_SSE4_1 | ||
52 | #include <smmintrin.h> | ||
53 | |||
54 | static inline void | ||
55 | 2 | volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, | |
56 | float* qBuffer, | ||
57 | const lv_8sc_t* complexVector, | ||
58 | const float scalar, | ||
59 | unsigned int num_points) | ||
60 | { | ||
61 | 2 | float* iBufferPtr = iBuffer; | |
62 | 2 | float* qBufferPtr = qBuffer; | |
63 | |||
64 | 2 | unsigned int number = 0; | |
65 | 2 | const unsigned int eighthPoints = num_points / 8; | |
66 | __m128 iFloatValue, qFloatValue; | ||
67 | |||
68 | 2 | const float iScalar = 1.0 / scalar; | |
69 | 2 | __m128 invScalar = _mm_set_ps1(iScalar); | |
70 | __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; | ||
71 | 2 | int8_t* complexVectorPtr = (int8_t*)complexVector; | |
72 | |||
73 | 2 | __m128i iMoveMask = _mm_set_epi8( | |
74 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); | ||
75 | 2 | __m128i qMoveMask = _mm_set_epi8( | |
76 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); | ||
77 | |||
78 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
79 | 32766 | complexVal = _mm_load_si128((__m128i*)complexVectorPtr); | |
80 | 32766 | complexVectorPtr += 16; | |
81 | 32766 | iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); | |
82 | 32766 | qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); | |
83 | |||
84 | 32766 | iIntVal = _mm_cvtepi8_epi32(iComplexVal); | |
85 | 32766 | iFloatValue = _mm_cvtepi32_ps(iIntVal); | |
86 | 32766 | iFloatValue = _mm_mul_ps(iFloatValue, invScalar); | |
87 | _mm_store_ps(iBufferPtr, iFloatValue); | ||
88 | 32766 | iBufferPtr += 4; | |
89 | |||
90 | 32766 | iComplexVal = _mm_srli_si128(iComplexVal, 4); | |
91 | |||
92 | 32766 | iIntVal = _mm_cvtepi8_epi32(iComplexVal); | |
93 | 32766 | iFloatValue = _mm_cvtepi32_ps(iIntVal); | |
94 | 32766 | iFloatValue = _mm_mul_ps(iFloatValue, invScalar); | |
95 | _mm_store_ps(iBufferPtr, iFloatValue); | ||
96 | 32766 | iBufferPtr += 4; | |
97 | |||
98 | 32766 | qIntVal = _mm_cvtepi8_epi32(qComplexVal); | |
99 | 32766 | qFloatValue = _mm_cvtepi32_ps(qIntVal); | |
100 | 32766 | qFloatValue = _mm_mul_ps(qFloatValue, invScalar); | |
101 | _mm_store_ps(qBufferPtr, qFloatValue); | ||
102 | 32766 | qBufferPtr += 4; | |
103 | |||
104 | 32766 | qComplexVal = _mm_srli_si128(qComplexVal, 4); | |
105 | |||
106 | 32766 | qIntVal = _mm_cvtepi8_epi32(qComplexVal); | |
107 | 32766 | qFloatValue = _mm_cvtepi32_ps(qIntVal); | |
108 | 32766 | qFloatValue = _mm_mul_ps(qFloatValue, invScalar); | |
109 | _mm_store_ps(qBufferPtr, qFloatValue); | ||
110 | |||
111 | 32766 | qBufferPtr += 4; | |
112 | } | ||
113 | |||
114 | 2 | number = eighthPoints * 8; | |
115 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
116 | 14 | *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
117 | 14 | *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
118 | } | ||
119 | 2 | } | |
120 | #endif /* LV_HAVE_SSE4_1 */ | ||
121 | |||
122 | |||
123 | #ifdef LV_HAVE_SSE | ||
124 | #include <xmmintrin.h> | ||
125 | |||
126 | 2 | static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, | |
127 | float* qBuffer, | ||
128 | const lv_8sc_t* complexVector, | ||
129 | const float scalar, | ||
130 | unsigned int num_points) | ||
131 | { | ||
132 | 2 | float* iBufferPtr = iBuffer; | |
133 | 2 | float* qBufferPtr = qBuffer; | |
134 | |||
135 | 2 | unsigned int number = 0; | |
136 | 2 | const unsigned int quarterPoints = num_points / 4; | |
137 | __m128 cplxValue1, cplxValue2, iValue, qValue; | ||
138 | |||
139 | 2 | __m128 invScalar = _mm_set_ps1(1.0 / scalar); | |
140 | 2 | int8_t* complexVectorPtr = (int8_t*)complexVector; | |
141 | |||
142 | __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; | ||
143 | |||
144 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
145 | 65534 | floatBuffer[0] = (float)(complexVectorPtr[0]); | |
146 | 65534 | floatBuffer[1] = (float)(complexVectorPtr[1]); | |
147 | 65534 | floatBuffer[2] = (float)(complexVectorPtr[2]); | |
148 | 65534 | floatBuffer[3] = (float)(complexVectorPtr[3]); | |
149 | |||
150 | 65534 | floatBuffer[4] = (float)(complexVectorPtr[4]); | |
151 | 65534 | floatBuffer[5] = (float)(complexVectorPtr[5]); | |
152 | 65534 | floatBuffer[6] = (float)(complexVectorPtr[6]); | |
153 | 65534 | floatBuffer[7] = (float)(complexVectorPtr[7]); | |
154 | |||
155 | 65534 | cplxValue1 = _mm_load_ps(&floatBuffer[0]); | |
156 | 65534 | cplxValue2 = _mm_load_ps(&floatBuffer[4]); | |
157 | |||
158 | 65534 | complexVectorPtr += 8; | |
159 | |||
160 | 65534 | cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); | |
161 | 65534 | cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); | |
162 | |||
163 | // Arrange in i1i2i3i4 format | ||
164 | 65534 | iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); | |
165 | 65534 | qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); | |
166 | |||
167 | _mm_store_ps(iBufferPtr, iValue); | ||
168 | _mm_store_ps(qBufferPtr, qValue); | ||
169 | |||
170 | 65534 | iBufferPtr += 4; | |
171 | 65534 | qBufferPtr += 4; | |
172 | } | ||
173 | |||
174 | 2 | number = quarterPoints * 4; | |
175 | 2 | complexVectorPtr = (int8_t*)&complexVector[number]; | |
176 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
177 | 6 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
178 | 6 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
179 | } | ||
180 | 2 | } | |
181 | #endif /* LV_HAVE_SSE */ | ||
182 | |||
183 | |||
184 | #ifdef LV_HAVE_AVX2 | ||
185 | #include <immintrin.h> | ||
186 | |||
187 | 2 | static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, | |
188 | float* qBuffer, | ||
189 | const lv_8sc_t* complexVector, | ||
190 | const float scalar, | ||
191 | unsigned int num_points) | ||
192 | { | ||
193 | 2 | float* iBufferPtr = iBuffer; | |
194 | 2 | float* qBufferPtr = qBuffer; | |
195 | |||
196 | 2 | unsigned int number = 0; | |
197 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
198 | __m256 iFloatValue, qFloatValue; | ||
199 | |||
200 | 2 | const float iScalar = 1.0 / scalar; | |
201 | 2 | __m256 invScalar = _mm256_set1_ps(iScalar); | |
202 | __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; | ||
203 | 2 | int8_t* complexVectorPtr = (int8_t*)complexVector; | |
204 | |||
205 | 2 | __m256i iMoveMask = _mm256_set_epi8(0x80, | |
206 | 0x80, | ||
207 | 0x80, | ||
208 | 0x80, | ||
209 | 0x80, | ||
210 | 0x80, | ||
211 | 0x80, | ||
212 | 0x80, | ||
213 | 14, | ||
214 | 12, | ||
215 | 10, | ||
216 | 8, | ||
217 | 6, | ||
218 | 4, | ||
219 | 2, | ||
220 | 0, | ||
221 | 0x80, | ||
222 | 0x80, | ||
223 | 0x80, | ||
224 | 0x80, | ||
225 | 0x80, | ||
226 | 0x80, | ||
227 | 0x80, | ||
228 | 0x80, | ||
229 | 14, | ||
230 | 12, | ||
231 | 10, | ||
232 | 8, | ||
233 | 6, | ||
234 | 4, | ||
235 | 2, | ||
236 | 0); | ||
237 | 2 | __m256i qMoveMask = _mm256_set_epi8(0x80, | |
238 | 0x80, | ||
239 | 0x80, | ||
240 | 0x80, | ||
241 | 0x80, | ||
242 | 0x80, | ||
243 | 0x80, | ||
244 | 0x80, | ||
245 | 15, | ||
246 | 13, | ||
247 | 11, | ||
248 | 9, | ||
249 | 7, | ||
250 | 5, | ||
251 | 3, | ||
252 | 1, | ||
253 | 0x80, | ||
254 | 0x80, | ||
255 | 0x80, | ||
256 | 0x80, | ||
257 | 0x80, | ||
258 | 0x80, | ||
259 | 0x80, | ||
260 | 0x80, | ||
261 | 15, | ||
262 | 13, | ||
263 | 11, | ||
264 | 9, | ||
265 | 7, | ||
266 | 5, | ||
267 | 3, | ||
268 | 1); | ||
269 | |||
270 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
271 | 16382 | complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); | |
272 | 16382 | complexVectorPtr += 32; | |
273 | 16382 | iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); | |
274 | 16382 | qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); | |
275 | |||
276 | 32764 | iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); | |
277 | 16382 | iFloatValue = _mm256_cvtepi32_ps(iIntVal); | |
278 | 16382 | iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); | |
279 | _mm256_store_ps(iBufferPtr, iFloatValue); | ||
280 | 16382 | iBufferPtr += 8; | |
281 | |||
282 | 16382 | iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); | |
283 | 32764 | iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); | |
284 | 16382 | iFloatValue = _mm256_cvtepi32_ps(iIntVal); | |
285 | 16382 | iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); | |
286 | _mm256_store_ps(iBufferPtr, iFloatValue); | ||
287 | 16382 | iBufferPtr += 8; | |
288 | |||
289 | 32764 | qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); | |
290 | 16382 | qFloatValue = _mm256_cvtepi32_ps(qIntVal); | |
291 | 16382 | qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); | |
292 | _mm256_store_ps(qBufferPtr, qFloatValue); | ||
293 | 16382 | qBufferPtr += 8; | |
294 | |||
295 | 16382 | qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); | |
296 | 32764 | qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); | |
297 | 16382 | qFloatValue = _mm256_cvtepi32_ps(qIntVal); | |
298 | 16382 | qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); | |
299 | _mm256_store_ps(qBufferPtr, qFloatValue); | ||
300 | 16382 | qBufferPtr += 8; | |
301 | } | ||
302 | |||
303 | 2 | number = sixteenthPoints * 16; | |
304 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
305 | 30 | *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
306 | 30 | *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
307 | } | ||
308 | 2 | } | |
309 | #endif /* LV_HAVE_AVX2 */ | ||
310 | |||
311 | |||
312 | #ifdef LV_HAVE_GENERIC | ||
313 | |||
314 | static inline void | ||
315 | 2 | volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, | |
316 | float* qBuffer, | ||
317 | const lv_8sc_t* complexVector, | ||
318 | const float scalar, | ||
319 | unsigned int num_points) | ||
320 | { | ||
321 | 2 | const int8_t* complexVectorPtr = (const int8_t*)complexVector; | |
322 | 2 | float* iBufferPtr = iBuffer; | |
323 | 2 | float* qBufferPtr = qBuffer; | |
324 | unsigned int number; | ||
325 | 2 | const float invScalar = 1.0 / scalar; | |
326 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
327 | 262142 | *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; | |
328 | 262142 | *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; | |
329 | } | ||
330 | 2 | } | |
331 | #endif /* LV_HAVE_GENERIC */ | ||
332 | |||
333 | |||
334 | #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */ | ||
335 | |||
336 | |||
337 | #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H | ||
338 | #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H | ||
339 | |||
340 | #include <inttypes.h> | ||
341 | #include <stdio.h> | ||
342 | #include <volk/volk_common.h> | ||
343 | |||
344 | #ifdef LV_HAVE_AVX2 | ||
345 | #include <immintrin.h> | ||
346 | |||
347 | 2 | static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, | |
348 | float* qBuffer, | ||
349 | const lv_8sc_t* complexVector, | ||
350 | const float scalar, | ||
351 | unsigned int num_points) | ||
352 | { | ||
353 | 2 | float* iBufferPtr = iBuffer; | |
354 | 2 | float* qBufferPtr = qBuffer; | |
355 | |||
356 | 2 | unsigned int number = 0; | |
357 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
358 | __m256 iFloatValue, qFloatValue; | ||
359 | |||
360 | 2 | const float iScalar = 1.0 / scalar; | |
361 | 2 | __m256 invScalar = _mm256_set1_ps(iScalar); | |
362 | __m256i complexVal, iIntVal, qIntVal; | ||
363 | __m128i iComplexVal, qComplexVal; | ||
364 | 2 | int8_t* complexVectorPtr = (int8_t*)complexVector; | |
365 | |||
366 | 2 | __m256i MoveMask = _mm256_set_epi8(15, | |
367 | 13, | ||
368 | 11, | ||
369 | 9, | ||
370 | 7, | ||
371 | 5, | ||
372 | 3, | ||
373 | 1, | ||
374 | 14, | ||
375 | 12, | ||
376 | 10, | ||
377 | 8, | ||
378 | 6, | ||
379 | 4, | ||
380 | 2, | ||
381 | 0, | ||
382 | 15, | ||
383 | 13, | ||
384 | 11, | ||
385 | 9, | ||
386 | 7, | ||
387 | 5, | ||
388 | 3, | ||
389 | 1, | ||
390 | 14, | ||
391 | 12, | ||
392 | 10, | ||
393 | 8, | ||
394 | 6, | ||
395 | 4, | ||
396 | 2, | ||
397 | 0); | ||
398 | |||
399 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
400 | 16382 | complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
401 | 16382 | complexVectorPtr += 32; | |
402 | 16382 | complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); | |
403 | 16382 | complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); | |
404 | 16382 | iComplexVal = _mm256_extractf128_si256(complexVal, 0); | |
405 | 16382 | qComplexVal = _mm256_extractf128_si256(complexVal, 1); | |
406 | |||
407 | 16382 | iIntVal = _mm256_cvtepi8_epi32(iComplexVal); | |
408 | 16382 | iFloatValue = _mm256_cvtepi32_ps(iIntVal); | |
409 | 16382 | iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); | |
410 | _mm256_storeu_ps(iBufferPtr, iFloatValue); | ||
411 | 16382 | iBufferPtr += 8; | |
412 | |||
413 | 16382 | qIntVal = _mm256_cvtepi8_epi32(qComplexVal); | |
414 | 16382 | qFloatValue = _mm256_cvtepi32_ps(qIntVal); | |
415 | 16382 | qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); | |
416 | _mm256_storeu_ps(qBufferPtr, qFloatValue); | ||
417 | 16382 | qBufferPtr += 8; | |
418 | |||
419 | 16382 | complexVal = _mm256_srli_si256(complexVal, 8); | |
420 | 16382 | iComplexVal = _mm256_extractf128_si256(complexVal, 0); | |
421 | 16382 | qComplexVal = _mm256_extractf128_si256(complexVal, 1); | |
422 | |||
423 | 16382 | iIntVal = _mm256_cvtepi8_epi32(iComplexVal); | |
424 | 16382 | iFloatValue = _mm256_cvtepi32_ps(iIntVal); | |
425 | 16382 | iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); | |
426 | _mm256_storeu_ps(iBufferPtr, iFloatValue); | ||
427 | 16382 | iBufferPtr += 8; | |
428 | |||
429 | 16382 | qIntVal = _mm256_cvtepi8_epi32(qComplexVal); | |
430 | 16382 | qFloatValue = _mm256_cvtepi32_ps(qIntVal); | |
431 | 16382 | qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); | |
432 | _mm256_storeu_ps(qBufferPtr, qFloatValue); | ||
433 | 16382 | qBufferPtr += 8; | |
434 | } | ||
435 | |||
436 | 2 | number = sixteenthPoints * 16; | |
437 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
438 | 30 | *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
439 | 30 | *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; | |
440 | } | ||
441 | 2 | } | |
442 | #endif /* LV_HAVE_AVX2 */ | ||
443 | |||
444 | #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */ | ||
445 |