GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
Date: 2023-10-23 23:10:04
Coverage summary (executed / total / coverage):
Lines: 156 / 156 (100.0%)
Functions: 5 / 5 (100.0%)
Branches: 18 / 18 (100.0%)

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_8ic_s32f_deinterleave_32f_x2
12 *
13 * \b Overview
14 *
15 * Deinterleaves the complex 8-bit char vector into I & Q vector data,
16 * converts them to floats, and divides the results by the scalar
17 * factor.
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
22 * complexVector, const float scalar, unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li complexVector: The complex input vector.
26 * \li scalar: The scalar value used to divide the floating point results.
27 * \li num_points: The number of complex data values to be deinterleaved.
28 *
29 * \b Outputs
30 * \li iBuffer: The I buffer output data.
31 * \li qBuffer: The Q buffer output data.
32 *
33 * \b Example
34 * \code
35 * int N = 10000;
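36 * // in: N interleaved 8-bit complex samples; i_out, q_out: N floats each (from volk_malloc).
37 * volk_8ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 128.f, N);
38 *
39 * volk_free(in); volk_free(i_out); volk_free(q_out);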
40 * \endcode
41 */
42
43 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
44 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
45
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49
50
51 #ifdef LV_HAVE_SSE4_1
52 #include <smmintrin.h>
53
54 static inline void
55 2 volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
56 float* qBuffer,
57 const lv_8sc_t* complexVector,
58 const float scalar,
59 unsigned int num_points)
60 {
61 2 float* iBufferPtr = iBuffer;
62 2 float* qBufferPtr = qBuffer;
63
64 2 unsigned int number = 0;
65 2 const unsigned int eighthPoints = num_points / 8;
66 __m128 iFloatValue, qFloatValue;
67
68 2 const float iScalar = 1.0 / scalar;
69 2 __m128 invScalar = _mm_set_ps1(iScalar);
70 __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
71 2 int8_t* complexVectorPtr = (int8_t*)complexVector;
72
73 2 __m128i iMoveMask = _mm_set_epi8(
74 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
75 2 __m128i qMoveMask = _mm_set_epi8(
76 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
77
78
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
79 32766 complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
80 32766 complexVectorPtr += 16;
81 32766 iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
82 32766 qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
83
84 32766 iIntVal = _mm_cvtepi8_epi32(iComplexVal);
85 32766 iFloatValue = _mm_cvtepi32_ps(iIntVal);
86 32766 iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
87 _mm_store_ps(iBufferPtr, iFloatValue);
88 32766 iBufferPtr += 4;
89
90 32766 iComplexVal = _mm_srli_si128(iComplexVal, 4);
91
92 32766 iIntVal = _mm_cvtepi8_epi32(iComplexVal);
93 32766 iFloatValue = _mm_cvtepi32_ps(iIntVal);
94 32766 iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
95 _mm_store_ps(iBufferPtr, iFloatValue);
96 32766 iBufferPtr += 4;
97
98 32766 qIntVal = _mm_cvtepi8_epi32(qComplexVal);
99 32766 qFloatValue = _mm_cvtepi32_ps(qIntVal);
100 32766 qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
101 _mm_store_ps(qBufferPtr, qFloatValue);
102 32766 qBufferPtr += 4;
103
104 32766 qComplexVal = _mm_srli_si128(qComplexVal, 4);
105
106 32766 qIntVal = _mm_cvtepi8_epi32(qComplexVal);
107 32766 qFloatValue = _mm_cvtepi32_ps(qIntVal);
108 32766 qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
109 _mm_store_ps(qBufferPtr, qFloatValue);
110
111 32766 qBufferPtr += 4;
112 }
113
114 2 number = eighthPoints * 8;
115
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
116 14 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
117 14 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
118 }
119 2 }
120 #endif /* LV_HAVE_SSE4_1 */
121
122
123 #ifdef LV_HAVE_SSE
124 #include <xmmintrin.h>
125
126 2 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
127 float* qBuffer,
128 const lv_8sc_t* complexVector,
129 const float scalar,
130 unsigned int num_points)
131 {
132 2 float* iBufferPtr = iBuffer;
133 2 float* qBufferPtr = qBuffer;
134
135 2 unsigned int number = 0;
136 2 const unsigned int quarterPoints = num_points / 4;
137 __m128 cplxValue1, cplxValue2, iValue, qValue;
138
139 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
140 2 int8_t* complexVectorPtr = (int8_t*)complexVector;
141
142 __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
143
144
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
145 65534 floatBuffer[0] = (float)(complexVectorPtr[0]);
146 65534 floatBuffer[1] = (float)(complexVectorPtr[1]);
147 65534 floatBuffer[2] = (float)(complexVectorPtr[2]);
148 65534 floatBuffer[3] = (float)(complexVectorPtr[3]);
149
150 65534 floatBuffer[4] = (float)(complexVectorPtr[4]);
151 65534 floatBuffer[5] = (float)(complexVectorPtr[5]);
152 65534 floatBuffer[6] = (float)(complexVectorPtr[6]);
153 65534 floatBuffer[7] = (float)(complexVectorPtr[7]);
154
155 65534 cplxValue1 = _mm_load_ps(&floatBuffer[0]);
156 65534 cplxValue2 = _mm_load_ps(&floatBuffer[4]);
157
158 65534 complexVectorPtr += 8;
159
160 65534 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
161 65534 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
162
163 // Arrange in i1i2i3i4 format
164 65534 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
165 65534 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
166
167 _mm_store_ps(iBufferPtr, iValue);
168 _mm_store_ps(qBufferPtr, qValue);
169
170 65534 iBufferPtr += 4;
171 65534 qBufferPtr += 4;
172 }
173
174 2 number = quarterPoints * 4;
175 2 complexVectorPtr = (int8_t*)&complexVector[number];
176
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
177 6 *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
178 6 *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
179 }
180 2 }
181 #endif /* LV_HAVE_SSE */
182
183
184 #ifdef LV_HAVE_AVX2
185 #include <immintrin.h>
186
187 2 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
188 float* qBuffer,
189 const lv_8sc_t* complexVector,
190 const float scalar,
191 unsigned int num_points)
192 {
193 2 float* iBufferPtr = iBuffer;
194 2 float* qBufferPtr = qBuffer;
195
196 2 unsigned int number = 0;
197 2 const unsigned int sixteenthPoints = num_points / 16;
198 __m256 iFloatValue, qFloatValue;
199
200 2 const float iScalar = 1.0 / scalar;
201 2 __m256 invScalar = _mm256_set1_ps(iScalar);
202 __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
203 2 int8_t* complexVectorPtr = (int8_t*)complexVector;
204
205 2 __m256i iMoveMask = _mm256_set_epi8(0x80,
206 0x80,
207 0x80,
208 0x80,
209 0x80,
210 0x80,
211 0x80,
212 0x80,
213 14,
214 12,
215 10,
216 8,
217 6,
218 4,
219 2,
220 0,
221 0x80,
222 0x80,
223 0x80,
224 0x80,
225 0x80,
226 0x80,
227 0x80,
228 0x80,
229 14,
230 12,
231 10,
232 8,
233 6,
234 4,
235 2,
236 0);
237 2 __m256i qMoveMask = _mm256_set_epi8(0x80,
238 0x80,
239 0x80,
240 0x80,
241 0x80,
242 0x80,
243 0x80,
244 0x80,
245 15,
246 13,
247 11,
248 9,
249 7,
250 5,
251 3,
252 1,
253 0x80,
254 0x80,
255 0x80,
256 0x80,
257 0x80,
258 0x80,
259 0x80,
260 0x80,
261 15,
262 13,
263 11,
264 9,
265 7,
266 5,
267 3,
268 1);
269
270
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
271 16382 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
272 16382 complexVectorPtr += 32;
273 16382 iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
274 16382 qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
275
276 32764 iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
277 16382 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
278 16382 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
279 _mm256_store_ps(iBufferPtr, iFloatValue);
280 16382 iBufferPtr += 8;
281
282 16382 iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
283 32764 iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
284 16382 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
285 16382 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
286 _mm256_store_ps(iBufferPtr, iFloatValue);
287 16382 iBufferPtr += 8;
288
289 32764 qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
290 16382 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
291 16382 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
292 _mm256_store_ps(qBufferPtr, qFloatValue);
293 16382 qBufferPtr += 8;
294
295 16382 qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
296 32764 qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
297 16382 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
298 16382 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
299 _mm256_store_ps(qBufferPtr, qFloatValue);
300 16382 qBufferPtr += 8;
301 }
302
303 2 number = sixteenthPoints * 16;
304
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
305 30 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
306 30 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
307 }
308 2 }
309 #endif /* LV_HAVE_AVX2 */
310
311
312 #ifdef LV_HAVE_GENERIC
313
314 static inline void
315 2 volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
316 float* qBuffer,
317 const lv_8sc_t* complexVector,
318 const float scalar,
319 unsigned int num_points)
320 {
321 2 const int8_t* complexVectorPtr = (const int8_t*)complexVector;
322 2 float* iBufferPtr = iBuffer;
323 2 float* qBufferPtr = qBuffer;
324 unsigned int number;
325 2 const float invScalar = 1.0 / scalar;
326
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
327 262142 *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
328 262142 *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
329 }
330 2 }
331 #endif /* LV_HAVE_GENERIC */
332
333
334 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
335
336
337 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
338 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
339
340 #include <inttypes.h>
341 #include <stdio.h>
342 #include <volk/volk_common.h>
343
344 #ifdef LV_HAVE_AVX2
345 #include <immintrin.h>
346
347 2 static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
348 float* qBuffer,
349 const lv_8sc_t* complexVector,
350 const float scalar,
351 unsigned int num_points)
352 {
353 2 float* iBufferPtr = iBuffer;
354 2 float* qBufferPtr = qBuffer;
355
356 2 unsigned int number = 0;
357 2 const unsigned int sixteenthPoints = num_points / 16;
358 __m256 iFloatValue, qFloatValue;
359
360 2 const float iScalar = 1.0 / scalar;
361 2 __m256 invScalar = _mm256_set1_ps(iScalar);
362 __m256i complexVal, iIntVal, qIntVal;
363 __m128i iComplexVal, qComplexVal;
364 2 int8_t* complexVectorPtr = (int8_t*)complexVector;
365
366 2 __m256i MoveMask = _mm256_set_epi8(15,
367 13,
368 11,
369 9,
370 7,
371 5,
372 3,
373 1,
374 14,
375 12,
376 10,
377 8,
378 6,
379 4,
380 2,
381 0,
382 15,
383 13,
384 11,
385 9,
386 7,
387 5,
388 3,
389 1,
390 14,
391 12,
392 10,
393 8,
394 6,
395 4,
396 2,
397 0);
398
399
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
400 16382 complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
401 16382 complexVectorPtr += 32;
402 16382 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
403 16382 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
404 16382 iComplexVal = _mm256_extractf128_si256(complexVal, 0);
405 16382 qComplexVal = _mm256_extractf128_si256(complexVal, 1);
406
407 16382 iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
408 16382 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
409 16382 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
410 _mm256_storeu_ps(iBufferPtr, iFloatValue);
411 16382 iBufferPtr += 8;
412
413 16382 qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
414 16382 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
415 16382 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
416 _mm256_storeu_ps(qBufferPtr, qFloatValue);
417 16382 qBufferPtr += 8;
418
419 16382 complexVal = _mm256_srli_si256(complexVal, 8);
420 16382 iComplexVal = _mm256_extractf128_si256(complexVal, 0);
421 16382 qComplexVal = _mm256_extractf128_si256(complexVal, 1);
422
423 16382 iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
424 16382 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
425 16382 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
426 _mm256_storeu_ps(iBufferPtr, iFloatValue);
427 16382 iBufferPtr += 8;
428
429 16382 qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
430 16382 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
431 16382 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
432 _mm256_storeu_ps(qBufferPtr, qFloatValue);
433 16382 qBufferPtr += 8;
434 }
435
436 2 number = sixteenthPoints * 16;
437
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
438 30 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
439 30 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
440 }
441 2 }
442 #endif /* LV_HAVE_AVX2 */
443
444 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
445