GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16ic_deinterleave_real_8i.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 113 113 100.0%
Functions: 5 5 100.0%
Branches: 14 14 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16ic_deinterleave_real_8i
12 *
13 * \b Overview
14 *
15 * Deinterleaves the complex 16-bit vector and returns the real (in-phase)
16 * part of each sample as an 8-bit value, keeping only its upper 8 bits.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector,
21 * unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li complexVector: The complex input vector.
25 * \li num_points: The number of complex data values to be deinterleaved.
26 *
27 * \b Outputs
28 * \li iBuffer: The I buffer output data with 8-bit precision.
29 *
30 * \b Example
31 * \code
32 * int N = 10000;
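33 * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, volk_get_alignment());
34 * int8_t* t = (int8_t*)volk_malloc(sizeof(int8_t) * N, volk_get_alignment());
35 * volk_16ic_deinterleave_real_8i(t, x, N); // fill x with input samples first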
36 * volk_free(x);
37 * volk_free(t);
38 * \endcode
39 */
40
41 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
43
44 #include <inttypes.h>
45 #include <stdio.h>
46
47
48 #ifdef LV_HAVE_AVX2
49 #include <immintrin.h>
50
51 2 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
52 const lv_16sc_t* complexVector,
53 unsigned int num_points)
54 {
55 2 unsigned int number = 0;
56 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 2 int8_t* iBufferPtr = iBuffer;
58 2 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
59 0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 0x80,
65 0x80,
66 13,
67 12,
68 9,
69 8,
70 5,
71 4,
72 1,
73 0,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 0x80,
81 0x80,
82 13,
83 12,
84 9,
85 8,
86 5,
87 4,
88 1,
89 0);
90 2 __m256i iMoveMask2 = _mm256_set_epi8(13,
91 12,
92 9,
93 8,
94 5,
95 4,
96 1,
97 0,
98 0x80,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 0x80,
105 0x80,
106 13,
107 12,
108 9,
109 8,
110 5,
111 4,
112 1,
113 0,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 0x80,
121 0x80);
122 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
123
124 2 unsigned int thirtysecondPoints = num_points / 32;
125
126
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (number = 0; number < thirtysecondPoints; number++) {
127 8190 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128 8190 complexVectorPtr += 32;
129 8190 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 8190 complexVectorPtr += 32;
131
132 8190 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133 8190 complexVectorPtr += 32;
134 8190 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135 8190 complexVectorPtr += 32;
136
137 8190 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138 8190 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
139
140 8190 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141 8190 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
142
143 8190 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144 8190 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
145
146 8190 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147 8190 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
148
149 8190 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150 8190 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
151
152 8190 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153 8190 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
154
155 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
156
157 8190 iBufferPtr += 32;
158 }
159
160 2 number = thirtysecondPoints * 32;
161 2 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
163 62 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164 62 int16ComplexVectorPtr++;
165 }
166 2 }
167 #endif /* LV_HAVE_AVX2 */
168
169
170 #ifdef LV_HAVE_SSSE3
171 #include <tmmintrin.h>
172
173 2 static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
174 const lv_16sc_t* complexVector,
175 unsigned int num_points)
176 {
177 2 unsigned int number = 0;
178 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
179 2 int8_t* iBufferPtr = iBuffer;
180 2 __m128i iMoveMask1 = _mm_set_epi8(
181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
182 2 __m128i iMoveMask2 = _mm_set_epi8(
183 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
185
186 2 unsigned int sixteenthPoints = num_points / 16;
187
188
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (number = 0; number < sixteenthPoints; number++) {
189 16382 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190 16382 complexVectorPtr += 16;
191 16382 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192 16382 complexVectorPtr += 16;
193
194 16382 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195 16382 complexVectorPtr += 16;
196 16382 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197 16382 complexVectorPtr += 16;
198
199 16382 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200 16382 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
201
202 16382 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
203
204 16382 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205 16382 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
206
207 16382 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
208
209
210 16382 complexVal1 = _mm_srai_epi16(complexVal1, 8);
211 16382 complexVal3 = _mm_srai_epi16(complexVal3, 8);
212
213 16382 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
214
215 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
216
217 16382 iBufferPtr += 16;
218 }
219
220 2 number = sixteenthPoints * 16;
221 2 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
223 30 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224 30 int16ComplexVectorPtr++;
225 }
226 2 }
227 #endif /* LV_HAVE_SSSE3 */
228
229 #ifdef LV_HAVE_GENERIC
230
231 2 static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
232 const lv_16sc_t* complexVector,
233 unsigned int num_points)
234 {
235 2 unsigned int number = 0;
236 2 int16_t* complexVectorPtr = (int16_t*)complexVector;
237 2 int8_t* iBufferPtr = iBuffer;
238
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
239 262142 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
240 262142 complexVectorPtr++;
241 }
242 2 }
243 #endif /* LV_HAVE_GENERIC */
244
245 #ifdef LV_HAVE_NEON
246 #include <arm_neon.h>
247
248 static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
249 const lv_16sc_t* complexVector,
250 unsigned int num_points)
251 {
252 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
253 int8_t* iBufferPtr = iBuffer;
254 unsigned int eighth_points = num_points / 8;
255 unsigned int number;
256
257 int16x8x2_t complexInput;
258 int8x8_t realOutput;
259 for (number = 0; number < eighth_points; number++) {
260 complexInput = vld2q_s16(complexVectorPtr);
261 realOutput = vshrn_n_s16(complexInput.val[0], 8);
262 vst1_s8(iBufferPtr, realOutput);
263 complexVectorPtr += 16;
264 iBufferPtr += 8;
265 }
266
267 for (number = eighth_points * 8; number < num_points; number++) {
268 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
269 complexVectorPtr++;
270 }
271 }
272 #endif /* LV_HAVE_NEON */
273
274 #ifdef LV_HAVE_ORC
275
276 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
277 const lv_16sc_t* complexVector,
278 unsigned int num_points);
279
280 2 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
281 const lv_16sc_t* complexVector,
282 unsigned int num_points)
283 {
284 2 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
285 2 }
286 #endif /* LV_HAVE_ORC */
287
288
289 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
290
291 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
292 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
293
294 #include <inttypes.h>
295 #include <stdio.h>
296
297
298 #ifdef LV_HAVE_AVX2
299 #include <immintrin.h>
300
301 2 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
302 const lv_16sc_t* complexVector,
303 unsigned int num_points)
304 {
305 2 unsigned int number = 0;
306 2 const int8_t* complexVectorPtr = (int8_t*)complexVector;
307 2 int8_t* iBufferPtr = iBuffer;
308 2 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
309 0x80,
310 0x80,
311 0x80,
312 0x80,
313 0x80,
314 0x80,
315 0x80,
316 13,
317 12,
318 9,
319 8,
320 5,
321 4,
322 1,
323 0,
324 0x80,
325 0x80,
326 0x80,
327 0x80,
328 0x80,
329 0x80,
330 0x80,
331 0x80,
332 13,
333 12,
334 9,
335 8,
336 5,
337 4,
338 1,
339 0);
340 2 __m256i iMoveMask2 = _mm256_set_epi8(13,
341 12,
342 9,
343 8,
344 5,
345 4,
346 1,
347 0,
348 0x80,
349 0x80,
350 0x80,
351 0x80,
352 0x80,
353 0x80,
354 0x80,
355 0x80,
356 13,
357 12,
358 9,
359 8,
360 5,
361 4,
362 1,
363 0,
364 0x80,
365 0x80,
366 0x80,
367 0x80,
368 0x80,
369 0x80,
370 0x80,
371 0x80);
372 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
373
374 2 unsigned int thirtysecondPoints = num_points / 32;
375
376
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (number = 0; number < thirtysecondPoints; number++) {
377 8190 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378 8190 complexVectorPtr += 32;
379 8190 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
380 8190 complexVectorPtr += 32;
381
382 8190 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
383 8190 complexVectorPtr += 32;
384 8190 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
385 8190 complexVectorPtr += 32;
386
387 8190 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
388 8190 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
389
390 8190 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
391 8190 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
392
393 8190 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
394 8190 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
395
396 8190 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
397 8190 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
398
399 8190 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
400 8190 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
401
402 8190 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
403 8190 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
404
405 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
406
407 8190 iBufferPtr += 32;
408 }
409
410 2 number = thirtysecondPoints * 32;
411 2 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
412
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
413 62 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
414 62 int16ComplexVectorPtr++;
415 }
416 2 }
417 #endif /* LV_HAVE_AVX2 */
418 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
419