GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8ic_deinterleave_16i_x2.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 123 123 100.0%
Functions: 5 5 100.0%
Branches: 18 18 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_8ic_deinterleave_16i_x2
12 *
13 * \b Overview
14 *
15 * Deinterleaves the complex 8-bit char vector into I & Q vector data,
16 * converts them to 16-bit shorts and scales each value by 256 (left shift by 8).
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
21 * complexVector, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li complexVector: The complex input vector.
25 * \li num_points: The number of complex data values to be deinterleaved.
26 *
27 * \b Outputs
28 * \li iBuffer: The I buffer output data.
29 * \li qBuffer: The Q buffer output data.
30 *
31 * \b Example
32 * \code
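33 * int N = 10000;
34 * // in: N complex 8-bit samples; iOut, qOut: N int16_t each (allocation not shown)
35 * volk_8ic_deinterleave_16i_x2(iOut, qOut, in, N);
36 *
37 * // release the buffers: volk_free(in); volk_free(iOut); volk_free(qOut);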
38 * \endcode
39 */
40
41 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
42 #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
43
44 #include <inttypes.h>
45 #include <stdio.h>
46
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49
50 2 static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
51 int16_t* qBuffer,
52 const lv_8sc_t* complexVector,
53 unsigned int num_points)
54 {
55 2 unsigned int number = 0;
56 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 2 int16_t* iBufferPtr = iBuffer;
58 2 int16_t* qBufferPtr = qBuffer;
59 2 __m256i MoveMask = _mm256_set_epi8(15,
60 13,
61 11,
62 9,
63 7,
64 5,
65 3,
66 1,
67 14,
68 12,
69 10,
70 8,
71 6,
72 4,
73 2,
74 0,
75 15,
76 13,
77 11,
78 9,
79 7,
80 5,
81 3,
82 1,
83 14,
84 12,
85 10,
86 8,
87 6,
88 4,
89 2,
90 0);
91 __m256i complexVal, iOutputVal, qOutputVal;
92 __m128i iOutputVal0, qOutputVal0;
93
94 2 unsigned int sixteenthPoints = num_points / 16;
95
96
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < sixteenthPoints; number++) {
97 16382 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
98 16382 complexVectorPtr += 32;
99
100 16382 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
101 16382 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
102
103 16382 iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
104 16382 qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
105
106 16382 iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
107 16382 iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
108
109 16382 qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
110 16382 qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
111
112 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
113 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
114
115 16382 iBufferPtr += 16;
116 16382 qBufferPtr += 16;
117 }
118
119 2 number = sixteenthPoints * 16;
120
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
121 30 *iBufferPtr++ =
122 30 ((int16_t)*complexVectorPtr++) *
123 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
124 30 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
125 }
126 2 }
127 #endif /* LV_HAVE_AVX2 */
128
129 #ifdef LV_HAVE_SSE4_1
130 #include <smmintrin.h>
131
132 2 static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
133 int16_t* qBuffer,
134 const lv_8sc_t* complexVector,
135 unsigned int num_points)
136 {
137 2 unsigned int number = 0;
138 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
139 2 int16_t* iBufferPtr = iBuffer;
140 2 int16_t* qBufferPtr = qBuffer;
141 2 __m128i iMoveMask = _mm_set_epi8(0x80,
142 0x80,
143 0x80,
144 0x80,
145 0x80,
146 0x80,
147 0x80,
148 0x80,
149 14,
150 12,
151 10,
152 8,
153 6,
154 4,
155 2,
156 0); // set 16 byte values
157 2 __m128i qMoveMask = _mm_set_epi8(
158 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
159 __m128i complexVal, iOutputVal, qOutputVal;
160
161 2 unsigned int eighthPoints = num_points / 8;
162
163
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < eighthPoints; number++) {
164 32766 complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
165 32766 complexVectorPtr += 16; // aligned load
166
167 32766 iOutputVal = _mm_shuffle_epi8(complexVal,
168 iMoveMask); // shuffle 16 bytes of 128bit complexVal
169 32766 qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
170
171 32766 iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
172 // of lower 8 bytes of input to output
173 iOutputVal =
174 32766 _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
175 // 16-bit integers, shift in with zeros
176
177 32766 qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
178 32766 qOutputVal = _mm_slli_epi16(qOutputVal, 8);
179
180 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
181 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
182
183 32766 iBufferPtr += 8;
184 32766 qBufferPtr += 8;
185 }
186
187 2 number = eighthPoints * 8;
188
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
189 14 *iBufferPtr++ =
190 14 ((int16_t)*complexVectorPtr++) *
191 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
192 14 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
193 }
194 2 }
195 #endif /* LV_HAVE_SSE4_1 */
196
197
198 #ifdef LV_HAVE_AVX
199 #include <immintrin.h>
200
201 2 static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
202 int16_t* qBuffer,
203 const lv_8sc_t* complexVector,
204 unsigned int num_points)
205 {
206 2 unsigned int number = 0;
207 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
208 2 int16_t* iBufferPtr = iBuffer;
209 2 int16_t* qBufferPtr = qBuffer;
210 2 __m128i iMoveMask = _mm_set_epi8(0x80,
211 0x80,
212 0x80,
213 0x80,
214 0x80,
215 0x80,
216 0x80,
217 0x80,
218 14,
219 12,
220 10,
221 8,
222 6,
223 4,
224 2,
225 0); // set 16 byte values
226 2 __m128i qMoveMask = _mm_set_epi8(
227 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
228 __m256i complexVal, iOutputVal, qOutputVal;
229 __m128i complexVal1, complexVal0;
230 __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
231
232 2 unsigned int sixteenthPoints = num_points / 16;
233
234
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < sixteenthPoints; number++) {
235 16382 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
236 16382 complexVectorPtr += 32; // aligned load
237
238 // Extract from complexVal to iOutputVal and qOutputVal
239 16382 complexVal1 = _mm256_extractf128_si256(complexVal, 1);
240 16382 complexVal0 = _mm256_extractf128_si256(complexVal, 0);
241
242 16382 iOutputVal1 = _mm_shuffle_epi8(
243 complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
244 16382 iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
245 16382 qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
246 16382 qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
247
248 iOutputVal1 =
249 16382 _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
250 // lower 8 bytes of input to output
251 iOutputVal1 =
252 16382 _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
253 // 16-bit integers, shift in with zeros
254 16382 iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
255 16382 iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
256
257 16382 qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
258 16382 qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
259 16382 qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
260 16382 qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
261
262 // Pack iOutputVal0,1 to iOutputVal
263 16382 __m256i dummy = _mm256_setzero_si256();
264 16382 iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
265 16382 iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
266 16382 qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
267 16382 qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
268
269 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
270 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
271
272 16382 iBufferPtr += 16;
273 16382 qBufferPtr += 16;
274 }
275
276 2 number = sixteenthPoints * 16;
277
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
278 30 *iBufferPtr++ =
279 30 ((int16_t)*complexVectorPtr++) *
280 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
281 30 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
282 }
283 2 }
284 #endif /* LV_HAVE_AVX */
285
286
287 #ifdef LV_HAVE_GENERIC
288
289 2 static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
290 int16_t* qBuffer,
291 const lv_8sc_t* complexVector,
292 unsigned int num_points)
293 {
294 2 const int8_t* complexVectorPtr = (const int8_t*)complexVector;
295 2 int16_t* iBufferPtr = iBuffer;
296 2 int16_t* qBufferPtr = qBuffer;
297 unsigned int number;
298
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
299 262142 *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
300 262142 *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
301 }
302 2 }
303 #endif /* LV_HAVE_GENERIC */
304
305
306 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
307
308 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
309 #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
310
311 #include <inttypes.h>
312 #include <stdio.h>
313
314 #ifdef LV_HAVE_AVX2
315 #include <immintrin.h>
316
317 2 static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
318 int16_t* qBuffer,
319 const lv_8sc_t* complexVector,
320 unsigned int num_points)
321 {
322 2 unsigned int number = 0;
323 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
324 2 int16_t* iBufferPtr = iBuffer;
325 2 int16_t* qBufferPtr = qBuffer;
326 2 __m256i MoveMask = _mm256_set_epi8(15,
327 13,
328 11,
329 9,
330 7,
331 5,
332 3,
333 1,
334 14,
335 12,
336 10,
337 8,
338 6,
339 4,
340 2,
341 0,
342 15,
343 13,
344 11,
345 9,
346 7,
347 5,
348 3,
349 1,
350 14,
351 12,
352 10,
353 8,
354 6,
355 4,
356 2,
357 0);
358 __m256i complexVal, iOutputVal, qOutputVal;
359 __m128i iOutputVal0, qOutputVal0;
360
361 2 unsigned int sixteenthPoints = num_points / 16;
362
363
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < sixteenthPoints; number++) {
364 16382 complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
365 16382 complexVectorPtr += 32;
366
367 16382 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
368 16382 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
369
370 16382 iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
371 16382 qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
372
373 16382 iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
374 16382 iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
375
376 16382 qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
377 16382 qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
378
379 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
380 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
381
382 16382 iBufferPtr += 16;
383 16382 qBufferPtr += 16;
384 }
385
386 2 number = sixteenthPoints * 16;
387
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
388 30 *iBufferPtr++ =
389 30 ((int16_t)*complexVectorPtr++) *
390 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
391 30 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
392 }
393 2 }
394 #endif /* LV_HAVE_AVX2 */
395 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
396