Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_s32f_deinterleave_32f_x2 | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Deinterleaves the complex 16 bit vector into I & Q vector data and | ||
16 | * returns the result as two vectors of floats that have been scaled. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const | ||
21 | * lv_16sc_t* complexVector, const float scalar, unsigned int num_points); \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li complexVector: The complex input vector of 16-bit shorts. | ||
25 | * \li scalar: The value that each sample of the input complex vector is divided by. | ||
26 | * \li num_points: The number of complex data values to be deinterleaved. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li iBuffer: The floating point I buffer output data. | ||
30 | * \li qBuffer: The floating point Q buffer output data. | ||
31 | * | ||
32 | * \b Example | ||
33 | * \code | ||
34 | * int N = 10000; | ||
35 | * | ||
36 | * volk_16ic_s32f_deinterleave_32f_x2(); | ||
37 | * | ||
38 | * volk_free(x); | ||
39 | * volk_free(t); | ||
40 | * \endcode | ||
41 | */ | ||
42 | |||
43 | #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H | ||
44 | #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H | ||
45 | |||
46 | #include <inttypes.h> | ||
47 | #include <stdio.h> | ||
48 | #include <volk/volk_common.h> | ||
49 | |||
50 | #ifdef LV_HAVE_AVX2 | ||
51 | #include <immintrin.h> | ||
52 | |||
53 | static inline void | ||
54 | 2 | volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, | |
55 | float* qBuffer, | ||
56 | const lv_16sc_t* complexVector, | ||
57 | const float scalar, | ||
58 | unsigned int num_points) | ||
59 | { | ||
60 | 2 | float* iBufferPtr = iBuffer; | |
61 | 2 | float* qBufferPtr = qBuffer; | |
62 | |||
63 | 2 | uint64_t number = 0; | |
64 | 2 | const uint64_t eighthPoints = num_points / 8; | |
65 | __m256 cplxValue1, cplxValue2, iValue, qValue; | ||
66 | __m256i cplxValueA, cplxValueB; | ||
67 | __m128i cplxValue128; | ||
68 | |||
69 | 2 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); | |
70 | 2 | int16_t* complexVectorPtr = (int16_t*)complexVector; | |
71 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
72 | |||
73 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
74 | |||
75 | 32766 | cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr); | |
76 | 32766 | complexVectorPtr += 16; | |
77 | |||
78 | // Sign-extend each 128-bit half from int16 to int32, then convert to float | ||
79 | 32766 | cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); | |
80 | 32766 | cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); | |
81 | 32766 | cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); | |
82 | 32766 | cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); | |
83 | 32766 | cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); | |
84 | 32766 | cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); | |
85 | |||
86 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); | |
87 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); | |
88 | |||
89 | // Arrange in i1i2i3i4i5i6i7i8 format (in-lane shuffle, then cross-lane permute) | ||
90 | 32766 | iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); | |
91 | 32766 | iValue = _mm256_permutevar8x32_ps(iValue, idx); | |
92 | // Arrange in q1q2q3q4q5q6q7q8 format (in-lane shuffle, then cross-lane permute) | ||
93 | 32766 | qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); | |
94 | 32766 | qValue = _mm256_permutevar8x32_ps(qValue, idx); | |
95 | |||
96 | _mm256_store_ps(iBufferPtr, iValue); | ||
97 | _mm256_store_ps(qBufferPtr, qValue); | ||
98 | |||
99 | 32766 | iBufferPtr += 8; | |
100 | 32766 | qBufferPtr += 8; | |
101 | } | ||
102 | |||
103 | 2 | number = eighthPoints * 8; | |
104 | 2 | complexVectorPtr = (int16_t*)&complexVector[number]; | |
105 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
106 | 14 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
107 | 14 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
108 | } | ||
109 | 2 | } | |
110 | #endif /* LV_HAVE_AVX2 */ | ||
111 | |||
112 | #ifdef LV_HAVE_SSE | ||
113 | #include <xmmintrin.h> | ||
114 | |||
115 | static inline void | ||
116 | 2 | volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, | |
117 | float* qBuffer, | ||
118 | const lv_16sc_t* complexVector, | ||
119 | const float scalar, | ||
120 | unsigned int num_points) | ||
121 | { | ||
122 | 2 | float* iBufferPtr = iBuffer; | |
123 | 2 | float* qBufferPtr = qBuffer; | |
124 | |||
125 | 2 | uint64_t number = 0; | |
126 | 2 | const uint64_t quarterPoints = num_points / 4; | |
127 | __m128 cplxValue1, cplxValue2, iValue, qValue; | ||
128 | |||
129 | 2 | __m128 invScalar = _mm_set_ps1(1.0 / scalar); | |
130 | 2 | int16_t* complexVectorPtr = (int16_t*)complexVector; | |
131 | |||
132 | __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; | ||
133 | |||
134 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
135 | |||
136 | 65534 | floatBuffer[0] = (float)(complexVectorPtr[0]); | |
137 | 65534 | floatBuffer[1] = (float)(complexVectorPtr[1]); | |
138 | 65534 | floatBuffer[2] = (float)(complexVectorPtr[2]); | |
139 | 65534 | floatBuffer[3] = (float)(complexVectorPtr[3]); | |
140 | |||
141 | 65534 | floatBuffer[4] = (float)(complexVectorPtr[4]); | |
142 | 65534 | floatBuffer[5] = (float)(complexVectorPtr[5]); | |
143 | 65534 | floatBuffer[6] = (float)(complexVectorPtr[6]); | |
144 | 65534 | floatBuffer[7] = (float)(complexVectorPtr[7]); | |
145 | |||
146 | 65534 | cplxValue1 = _mm_load_ps(&floatBuffer[0]); | |
147 | 65534 | cplxValue2 = _mm_load_ps(&floatBuffer[4]); | |
148 | |||
149 | 65534 | complexVectorPtr += 8; | |
150 | |||
151 | 65534 | cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); | |
152 | 65534 | cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); | |
153 | |||
154 | // Arrange in i1i2i3i4 format | ||
155 | 65534 | iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); | |
156 | // Arrange in q1q2q3q4 format | ||
157 | 65534 | qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); | |
158 | |||
159 | _mm_store_ps(iBufferPtr, iValue); | ||
160 | _mm_store_ps(qBufferPtr, qValue); | ||
161 | |||
162 | 65534 | iBufferPtr += 4; | |
163 | 65534 | qBufferPtr += 4; | |
164 | } | ||
165 | |||
166 | 2 | number = quarterPoints * 4; | |
167 | 2 | complexVectorPtr = (int16_t*)&complexVector[number]; | |
168 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
169 | 6 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
170 | 6 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
171 | } | ||
172 | 2 | } | |
173 | #endif /* LV_HAVE_SSE */ | ||
174 | |||
175 | #ifdef LV_HAVE_GENERIC | ||
176 | |||
177 | static inline void | ||
178 | 2 | volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, | |
179 | float* qBuffer, | ||
180 | const lv_16sc_t* complexVector, | ||
181 | const float scalar, | ||
182 | unsigned int num_points) | ||
183 | { | ||
184 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
185 | 2 | float* iBufferPtr = iBuffer; | |
186 | 2 | float* qBufferPtr = qBuffer; | |
187 | unsigned int number; | ||
188 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
189 | 262142 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
190 | 262142 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
191 | } | ||
192 | 2 | } | |
193 | #endif /* LV_HAVE_GENERIC */ | ||
194 | |||
195 | #ifdef LV_HAVE_NEON | ||
196 | #include <arm_neon.h> | ||
197 | static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, | ||
198 | float* qBuffer, | ||
199 | const lv_16sc_t* complexVector, | ||
200 | const float scalar, | ||
201 | unsigned int num_points) | ||
202 | { | ||
203 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | ||
204 | float* iBufferPtr = iBuffer; | ||
205 | float* qBufferPtr = qBuffer; | ||
206 | unsigned int eighth_points = num_points / 4; | ||
207 | unsigned int number; | ||
208 | float iScalar = 1.f / scalar; | ||
209 | float32x4_t invScalar; | ||
210 | invScalar = vld1q_dup_f32(&iScalar); | ||
211 | |||
212 | int16x4x2_t complexInput_s16; | ||
213 | int32x4x2_t complexInput_s32; | ||
214 | float32x4x2_t complexFloat; | ||
215 | |||
216 | for (number = 0; number < eighth_points; number++) { | ||
217 | complexInput_s16 = vld2_s16(complexVectorPtr); | ||
218 | complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); | ||
219 | complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); | ||
220 | complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); | ||
221 | complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); | ||
222 | complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); | ||
223 | complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); | ||
224 | vst1q_f32(iBufferPtr, complexFloat.val[0]); | ||
225 | vst1q_f32(qBufferPtr, complexFloat.val[1]); | ||
226 | complexVectorPtr += 8; | ||
227 | iBufferPtr += 4; | ||
228 | qBufferPtr += 4; | ||
229 | } | ||
230 | |||
231 | for (number = eighth_points * 4; number < num_points; number++) { | ||
232 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | ||
233 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | ||
234 | } | ||
235 | } | ||
236 | #endif /* LV_HAVE_NEON */ | ||
237 | |||
238 | #ifdef LV_HAVE_ORC | ||
239 | extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, | ||
240 | float* qBuffer, | ||
241 | const lv_16sc_t* complexVector, | ||
242 | const float scalar, | ||
243 | unsigned int num_points); | ||
244 | |||
245 | static inline void | ||
246 | 2 | volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, | |
247 | float* qBuffer, | ||
248 | const lv_16sc_t* complexVector, | ||
249 | const float scalar, | ||
250 | unsigned int num_points) | ||
251 | { | ||
252 | 2 | volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl( | |
253 | iBuffer, qBuffer, complexVector, scalar, num_points); | ||
254 | 2 | } | |
255 | #endif /* LV_HAVE_ORC */ | ||
256 | |||
257 | |||
258 | #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */ | ||
259 | |||
260 | |||
261 | #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H | ||
262 | #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H | ||
263 | |||
264 | #include <inttypes.h> | ||
265 | #include <stdio.h> | ||
266 | #include <volk/volk_common.h> | ||
267 | |||
268 | #ifdef LV_HAVE_AVX2 | ||
269 | #include <immintrin.h> | ||
270 | |||
271 | static inline void | ||
272 | 2 | volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, | |
273 | float* qBuffer, | ||
274 | const lv_16sc_t* complexVector, | ||
275 | const float scalar, | ||
276 | unsigned int num_points) | ||
277 | { | ||
278 | 2 | float* iBufferPtr = iBuffer; | |
279 | 2 | float* qBufferPtr = qBuffer; | |
280 | |||
281 | 2 | uint64_t number = 0; | |
282 | 2 | const uint64_t eighthPoints = num_points / 8; | |
283 | __m256 cplxValue1, cplxValue2, iValue, qValue; | ||
284 | __m256i cplxValueA, cplxValueB; | ||
285 | __m128i cplxValue128; | ||
286 | |||
287 | 2 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); | |
288 | 2 | int16_t* complexVectorPtr = (int16_t*)complexVector; | |
289 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
290 | |||
291 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
292 | |||
293 | 32766 | cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
294 | 32766 | complexVectorPtr += 16; | |
295 | |||
296 | // Sign-extend each 128-bit half from int16 to int32, then convert to float | ||
297 | 32766 | cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); | |
298 | 32766 | cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); | |
299 | 32766 | cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); | |
300 | 32766 | cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); | |
301 | 32766 | cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); | |
302 | 32766 | cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); | |
303 | |||
304 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); | |
305 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); | |
306 | |||
307 | // Arrange in i1i2i3i4i5i6i7i8 format (in-lane shuffle, then cross-lane permute) | ||
308 | 32766 | iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); | |
309 | 32766 | iValue = _mm256_permutevar8x32_ps(iValue, idx); | |
310 | // Arrange in q1q2q3q4q5q6q7q8 format (in-lane shuffle, then cross-lane permute) | ||
311 | 32766 | qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); | |
312 | 32766 | qValue = _mm256_permutevar8x32_ps(qValue, idx); | |
313 | |||
314 | _mm256_storeu_ps(iBufferPtr, iValue); | ||
315 | _mm256_storeu_ps(qBufferPtr, qValue); | ||
316 | |||
317 | 32766 | iBufferPtr += 8; | |
318 | 32766 | qBufferPtr += 8; | |
319 | } | ||
320 | |||
321 | 2 | number = eighthPoints * 8; | |
322 | 2 | complexVectorPtr = (int16_t*)&complexVector[number]; | |
323 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
324 | 14 | *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
325 | 14 | *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; | |
326 | } | ||
327 | 2 | } | |
328 | #endif /* LV_HAVE_AVX2 */ | ||
329 | |||
330 | #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */ | ||
331 |