Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16i_convert_8i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Converts 16-bit shorts to 8-bit chars by keeping the upper 8 bits of each value (arithmetic right shift by 8). | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int | ||
20 | * num_points) \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li inputVector: The input vector of 16-bit shorts. | ||
24 | * \li num_points: The number of data points. | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li outputVector: The output vector of 8-bit chars. | ||
28 | * | ||
29 | * \b Example | ||
30 | * \code | ||
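31 | * int N = 10000; | ||
32 | * int16_t* x = (int16_t*)volk_malloc(sizeof(int16_t) * N, volk_get_alignment()); | ||
33 | * int8_t* t = (int8_t*)volk_malloc(sizeof(int8_t) * N, volk_get_alignment()); | ||
34 | * volk_16i_convert_8i(t, x, N); | ||
35 | * volk_free(x); | ||
36 | * volk_free(t); | ||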
37 | * \endcode | ||
38 | */ | ||
39 | |||
40 | #ifndef INCLUDED_volk_16i_convert_8i_u_H | ||
41 | #define INCLUDED_volk_16i_convert_8i_u_H | ||
42 | |||
43 | #include <inttypes.h> | ||
44 | #include <stdio.h> | ||
45 | |||
46 | #ifdef LV_HAVE_AVX2 | ||
47 | #include <immintrin.h> | ||
48 | |||
49 | 2 | static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector, | |
50 | const int16_t* inputVector, | ||
51 | unsigned int num_points) | ||
52 | { | ||
53 | 2 | unsigned int number = 0; | |
54 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
55 | |||
56 | 2 | int8_t* outputVectorPtr = outputVector; | |
57 | 2 | int16_t* inputPtr = (int16_t*)inputVector; | |
58 | __m256i inputVal1; | ||
59 | __m256i inputVal2; | ||
60 | __m256i ret; | ||
61 | |||
62 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
63 | |||
64 | // Load the 32 values | ||
65 | 8190 | inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); | |
66 | 8190 | inputPtr += 16; | |
67 | 8190 | inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); | |
68 | 8190 | inputPtr += 16; | |
69 | |||
70 | 8190 | inputVal1 = _mm256_srai_epi16(inputVal1, 8); | |
71 | 8190 | inputVal2 = _mm256_srai_epi16(inputVal2, 8); | |
72 | |||
73 | 8190 | ret = _mm256_packs_epi16(inputVal1, inputVal2); | |
74 | 8190 | ret = _mm256_permute4x64_epi64(ret, 0b11011000); | |
75 | |||
76 | _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); | ||
77 | |||
78 | 8190 | outputVectorPtr += 32; | |
79 | } | ||
80 | |||
81 | 2 | number = thirtysecondPoints * 32; | |
82 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
83 | 62 | outputVector[number] = (int8_t)(inputVector[number] >> 8); | |
84 | } | ||
85 | 2 | } | |
86 | #endif /* LV_HAVE_AVX2 */ | ||
87 | |||
88 | |||
89 | #ifdef LV_HAVE_SSE2 | ||
90 | #include <emmintrin.h> | ||
91 | |||
92 | 2 | static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, | |
93 | const int16_t* inputVector, | ||
94 | unsigned int num_points) | ||
95 | { | ||
96 | 2 | unsigned int number = 0; | |
97 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
98 | |||
99 | 2 | int8_t* outputVectorPtr = outputVector; | |
100 | 2 | int16_t* inputPtr = (int16_t*)inputVector; | |
101 | __m128i inputVal1; | ||
102 | __m128i inputVal2; | ||
103 | __m128i ret; | ||
104 | |||
105 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
106 | |||
107 | // Load the 16 values | ||
108 | 16382 | inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); | |
109 | 16382 | inputPtr += 8; | |
110 | 16382 | inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); | |
111 | 16382 | inputPtr += 8; | |
112 | |||
113 | 16382 | inputVal1 = _mm_srai_epi16(inputVal1, 8); | |
114 | 16382 | inputVal2 = _mm_srai_epi16(inputVal2, 8); | |
115 | |||
116 | 16382 | ret = _mm_packs_epi16(inputVal1, inputVal2); | |
117 | |||
118 | _mm_storeu_si128((__m128i*)outputVectorPtr, ret); | ||
119 | |||
120 | 16382 | outputVectorPtr += 16; | |
121 | } | ||
122 | |||
123 | 2 | number = sixteenthPoints * 16; | |
124 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
125 | 30 | outputVector[number] = (int8_t)(inputVector[number] >> 8); | |
126 | } | ||
127 | 2 | } | |
128 | #endif /* LV_HAVE_SSE2 */ | ||
129 | |||
130 | |||
131 | #ifdef LV_HAVE_GENERIC | ||
132 | |||
133 | 2 | static inline void volk_16i_convert_8i_generic(int8_t* outputVector, | |
134 | const int16_t* inputVector, | ||
135 | unsigned int num_points) | ||
136 | { | ||
137 | 2 | int8_t* outputVectorPtr = outputVector; | |
138 | 2 | const int16_t* inputVectorPtr = inputVector; | |
139 | 2 | unsigned int number = 0; | |
140 | |||
141 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
142 | 262142 | *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); | |
143 | } | ||
144 | 2 | } | |
145 | #endif /* LV_HAVE_GENERIC */ | ||
146 | |||
147 | |||
148 | #endif /* INCLUDED_volk_16i_convert_8i_u_H */ | ||
149 | #ifndef INCLUDED_volk_16i_convert_8i_a_H | ||
150 | #define INCLUDED_volk_16i_convert_8i_a_H | ||
151 | |||
152 | #include <inttypes.h> | ||
153 | #include <stdio.h> | ||
154 | |||
155 | #ifdef LV_HAVE_AVX2 | ||
156 | #include <immintrin.h> | ||
157 | |||
158 | 2 | static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector, | |
159 | const int16_t* inputVector, | ||
160 | unsigned int num_points) | ||
161 | { | ||
162 | 2 | unsigned int number = 0; | |
163 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
164 | |||
165 | 2 | int8_t* outputVectorPtr = outputVector; | |
166 | 2 | int16_t* inputPtr = (int16_t*)inputVector; | |
167 | __m256i inputVal1; | ||
168 | __m256i inputVal2; | ||
169 | __m256i ret; | ||
170 | |||
171 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
172 | |||
173 | // Load the 32 values | ||
174 | 8190 | inputVal1 = _mm256_load_si256((__m256i*)inputPtr); | |
175 | 8190 | inputPtr += 16; | |
176 | 8190 | inputVal2 = _mm256_load_si256((__m256i*)inputPtr); | |
177 | 8190 | inputPtr += 16; | |
178 | |||
179 | 8190 | inputVal1 = _mm256_srai_epi16(inputVal1, 8); | |
180 | 8190 | inputVal2 = _mm256_srai_epi16(inputVal2, 8); | |
181 | |||
182 | 8190 | ret = _mm256_packs_epi16(inputVal1, inputVal2); | |
183 | 8190 | ret = _mm256_permute4x64_epi64(ret, 0b11011000); | |
184 | |||
185 | _mm256_store_si256((__m256i*)outputVectorPtr, ret); | ||
186 | |||
187 | 8190 | outputVectorPtr += 32; | |
188 | } | ||
189 | |||
190 | 2 | number = thirtysecondPoints * 32; | |
191 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
192 | 62 | outputVector[number] = (int8_t)(inputVector[number] >> 8); | |
193 | } | ||
194 | 2 | } | |
195 | #endif /* LV_HAVE_AVX2 */ | ||
196 | |||
197 | |||
198 | #ifdef LV_HAVE_SSE2 | ||
199 | #include <emmintrin.h> | ||
200 | |||
201 | 2 | static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, | |
202 | const int16_t* inputVector, | ||
203 | unsigned int num_points) | ||
204 | { | ||
205 | 2 | unsigned int number = 0; | |
206 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
207 | |||
208 | 2 | int8_t* outputVectorPtr = outputVector; | |
209 | 2 | int16_t* inputPtr = (int16_t*)inputVector; | |
210 | __m128i inputVal1; | ||
211 | __m128i inputVal2; | ||
212 | __m128i ret; | ||
213 | |||
214 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
215 | |||
216 | // Load the 16 values | ||
217 | 16382 | inputVal1 = _mm_load_si128((__m128i*)inputPtr); | |
218 | 16382 | inputPtr += 8; | |
219 | 16382 | inputVal2 = _mm_load_si128((__m128i*)inputPtr); | |
220 | 16382 | inputPtr += 8; | |
221 | |||
222 | 16382 | inputVal1 = _mm_srai_epi16(inputVal1, 8); | |
223 | 16382 | inputVal2 = _mm_srai_epi16(inputVal2, 8); | |
224 | |||
225 | 16382 | ret = _mm_packs_epi16(inputVal1, inputVal2); | |
226 | |||
227 | _mm_store_si128((__m128i*)outputVectorPtr, ret); | ||
228 | |||
229 | 16382 | outputVectorPtr += 16; | |
230 | } | ||
231 | |||
232 | 2 | number = sixteenthPoints * 16; | |
233 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
234 | 30 | outputVector[number] = (int8_t)(inputVector[number] >> 8); | |
235 | } | ||
236 | 2 | } | |
237 | #endif /* LV_HAVE_SSE2 */ | ||
238 | |||
239 | |||
240 | #ifdef LV_HAVE_NEON | ||
241 | #include <arm_neon.h> | ||
242 | |||
243 | static inline void volk_16i_convert_8i_neon(int8_t* outputVector, | ||
244 | const int16_t* inputVector, | ||
245 | unsigned int num_points) | ||
246 | { | ||
247 | int8_t* outputVectorPtr = outputVector; | ||
248 | const int16_t* inputVectorPtr = inputVector; | ||
249 | unsigned int number = 0; | ||
250 | unsigned int sixteenth_points = num_points / 16; | ||
251 | |||
252 | int16x8_t inputVal0; | ||
253 | int16x8_t inputVal1; | ||
254 | int8x8_t outputVal0; | ||
255 | int8x8_t outputVal1; | ||
256 | int8x16_t outputVal; | ||
257 | |||
258 | for (number = 0; number < sixteenth_points; number++) { | ||
259 | // load two input vectors | ||
260 | inputVal0 = vld1q_s16(inputVectorPtr); | ||
261 | inputVal1 = vld1q_s16(inputVectorPtr + 8); | ||
262 | // shift right | ||
263 | outputVal0 = vshrn_n_s16(inputVal0, 8); | ||
264 | outputVal1 = vshrn_n_s16(inputVal1, 8); | ||
265 | // squash two vectors and write output | ||
266 | outputVal = vcombine_s8(outputVal0, outputVal1); | ||
267 | vst1q_s8(outputVectorPtr, outputVal); | ||
268 | inputVectorPtr += 16; | ||
269 | outputVectorPtr += 16; | ||
270 | } | ||
271 | |||
272 | for (number = sixteenth_points * 16; number < num_points; number++) { | ||
273 | *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); | ||
274 | } | ||
275 | } | ||
276 | #endif /* LV_HAVE_NEON */ | ||
277 | |||
278 | |||
279 | #ifdef LV_HAVE_GENERIC | ||
280 | |||
281 | 2 | static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, | |
282 | const int16_t* inputVector, | ||
283 | unsigned int num_points) | ||
284 | { | ||
285 | 2 | int8_t* outputVectorPtr = outputVector; | |
286 | 2 | const int16_t* inputVectorPtr = inputVector; | |
287 | 2 | unsigned int number = 0; | |
288 | |||
289 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
290 | 262142 | *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); | |
291 | } | ||
292 | 2 | } | |
293 | #endif /* LV_HAVE_GENERIC */ | ||
294 | |||
295 | #endif /* INCLUDED_volk_16i_convert_8i_a_H */ | ||
296 |