Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8i_convert_16i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Convert the input vector of 8-bit chars to a vector of 16-bit | ||
16 | * shorts, scaling each value by 256 (a left shift by 8 bits). | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int | ||
21 | * num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li inputVector: The input vector of 8-bit chars. | ||
25 | * \li num_points: The number of values. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li outputVector: The output 16-bit shorts. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
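32 | * int N = 10000; | ||
33 | * int8_t* in = (int8_t*)volk_malloc(sizeof(int8_t) * N, volk_get_alignment()); | ||
34 | * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t) * N, volk_get_alignment()); | ||
35 | * volk_8i_convert_16i(out, in, N); // out[i] == in[i] * 256 | ||
36 | * volk_free(in); volk_free(out); | ||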
37 | * \endcode | ||
38 | */ | ||
39 | |||
40 | #ifndef INCLUDED_volk_8i_convert_16i_u_H | ||
41 | #define INCLUDED_volk_8i_convert_16i_u_H | ||
42 | |||
43 | #include <inttypes.h> | ||
44 | #include <stdio.h> | ||
45 | |||
46 | #ifdef LV_HAVE_AVX2 | ||
47 | #include <immintrin.h> | ||
48 | |||
49 | 2 | static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector, | |
50 | const int8_t* inputVector, | ||
51 | unsigned int num_points) | ||
52 | { | ||
53 | 2 | unsigned int number = 0; | |
54 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
55 | |||
56 | 2 | const __m128i* inputVectorPtr = (const __m128i*)inputVector; | |
57 | 2 | __m256i* outputVectorPtr = (__m256i*)outputVector; | |
58 | __m128i inputVal; | ||
59 | __m256i ret; | ||
60 | |||
61 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
62 | 16382 | inputVal = _mm_loadu_si128(inputVectorPtr); | |
63 | 16382 | ret = _mm256_cvtepi8_epi16(inputVal); | |
64 | 16382 | ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 | |
65 | _mm256_storeu_si256(outputVectorPtr, ret); | ||
66 | |||
67 | 16382 | outputVectorPtr++; | |
68 | 16382 | inputVectorPtr++; | |
69 | } | ||
70 | |||
71 | 2 | number = sixteenthPoints * 16; | |
72 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
73 | 30 | outputVector[number] = (int16_t)(inputVector[number]) * 256; | |
74 | } | ||
75 | 2 | } | |
76 | #endif /* LV_HAVE_AVX2 */ | ||
77 | |||
78 | |||
79 | #ifdef LV_HAVE_SSE4_1 | ||
80 | #include <smmintrin.h> | ||
81 | |||
82 | 2 | static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, | |
83 | const int8_t* inputVector, | ||
84 | unsigned int num_points) | ||
85 | { | ||
86 | 2 | unsigned int number = 0; | |
87 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
88 | |||
89 | 2 | const __m128i* inputVectorPtr = (const __m128i*)inputVector; | |
90 | 2 | __m128i* outputVectorPtr = (__m128i*)outputVector; | |
91 | __m128i inputVal; | ||
92 | __m128i ret; | ||
93 | |||
94 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
95 | 16382 | inputVal = _mm_loadu_si128(inputVectorPtr); | |
96 | 16382 | ret = _mm_cvtepi8_epi16(inputVal); | |
97 | 16382 | ret = _mm_slli_epi16(ret, 8); // Multiply by 256 | |
98 | _mm_storeu_si128(outputVectorPtr, ret); | ||
99 | |||
100 | 16382 | outputVectorPtr++; | |
101 | |||
102 | 16382 | inputVal = _mm_srli_si128(inputVal, 8); | |
103 | 16382 | ret = _mm_cvtepi8_epi16(inputVal); | |
104 | 16382 | ret = _mm_slli_epi16(ret, 8); // Multiply by 256 | |
105 | _mm_storeu_si128(outputVectorPtr, ret); | ||
106 | |||
107 | 16382 | outputVectorPtr++; | |
108 | |||
109 | 16382 | inputVectorPtr++; | |
110 | } | ||
111 | |||
112 | 2 | number = sixteenthPoints * 16; | |
113 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
114 | 30 | outputVector[number] = (int16_t)(inputVector[number]) * 256; | |
115 | } | ||
116 | 2 | } | |
117 | #endif /* LV_HAVE_SSE4_1 */ | ||
118 | |||
119 | |||
120 | #ifdef LV_HAVE_GENERIC | ||
121 | |||
122 | 2 | static inline void volk_8i_convert_16i_generic(int16_t* outputVector, | |
123 | const int8_t* inputVector, | ||
124 | unsigned int num_points) | ||
125 | { | ||
126 | 2 | int16_t* outputVectorPtr = outputVector; | |
127 | 2 | const int8_t* inputVectorPtr = inputVector; | |
128 | 2 | unsigned int number = 0; | |
129 | |||
130 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
131 | 262142 | *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; | |
132 | } | ||
133 | 2 | } | |
134 | #endif /* LV_HAVE_GENERIC */ | ||
135 | |||
136 | |||
137 | #endif /* INCLUDED_volk_8i_convert_16i_u_H */ | ||
138 | |||
139 | |||
140 | #ifndef INCLUDED_volk_8i_convert_16i_a_H | ||
141 | #define INCLUDED_volk_8i_convert_16i_a_H | ||
142 | |||
143 | #include <inttypes.h> | ||
144 | #include <stdio.h> | ||
145 | |||
146 | #ifdef LV_HAVE_AVX2 | ||
147 | #include <immintrin.h> | ||
148 | |||
149 | 2 | static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector, | |
150 | const int8_t* inputVector, | ||
151 | unsigned int num_points) | ||
152 | { | ||
153 | 2 | unsigned int number = 0; | |
154 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
155 | |||
156 | 2 | const __m128i* inputVectorPtr = (const __m128i*)inputVector; | |
157 | 2 | __m256i* outputVectorPtr = (__m256i*)outputVector; | |
158 | __m128i inputVal; | ||
159 | __m256i ret; | ||
160 | |||
161 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
162 | 16382 | inputVal = _mm_load_si128(inputVectorPtr); | |
163 | 16382 | ret = _mm256_cvtepi8_epi16(inputVal); | |
164 | 16382 | ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 | |
165 | _mm256_store_si256(outputVectorPtr, ret); | ||
166 | |||
167 | 16382 | outputVectorPtr++; | |
168 | 16382 | inputVectorPtr++; | |
169 | } | ||
170 | |||
171 | 2 | number = sixteenthPoints * 16; | |
172 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
173 | 30 | outputVector[number] = (int16_t)(inputVector[number]) * 256; | |
174 | } | ||
175 | 2 | } | |
176 | #endif /* LV_HAVE_AVX2 */ | ||
177 | |||
178 | |||
179 | #ifdef LV_HAVE_SSE4_1 | ||
180 | #include <smmintrin.h> | ||
181 | |||
182 | 2 | static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, | |
183 | const int8_t* inputVector, | ||
184 | unsigned int num_points) | ||
185 | { | ||
186 | 2 | unsigned int number = 0; | |
187 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
188 | |||
189 | 2 | const __m128i* inputVectorPtr = (const __m128i*)inputVector; | |
190 | 2 | __m128i* outputVectorPtr = (__m128i*)outputVector; | |
191 | __m128i inputVal; | ||
192 | __m128i ret; | ||
193 | |||
194 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
195 | 16382 | inputVal = _mm_load_si128(inputVectorPtr); | |
196 | 16382 | ret = _mm_cvtepi8_epi16(inputVal); | |
197 | 16382 | ret = _mm_slli_epi16(ret, 8); // Multiply by 256 | |
198 | _mm_store_si128(outputVectorPtr, ret); | ||
199 | |||
200 | 16382 | outputVectorPtr++; | |
201 | |||
202 | 16382 | inputVal = _mm_srli_si128(inputVal, 8); | |
203 | 16382 | ret = _mm_cvtepi8_epi16(inputVal); | |
204 | 16382 | ret = _mm_slli_epi16(ret, 8); // Multiply by 256 | |
205 | _mm_store_si128(outputVectorPtr, ret); | ||
206 | |||
207 | 16382 | outputVectorPtr++; | |
208 | |||
209 | 16382 | inputVectorPtr++; | |
210 | } | ||
211 | |||
212 | 2 | number = sixteenthPoints * 16; | |
213 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
214 | 30 | outputVector[number] = (int16_t)(inputVector[number]) * 256; | |
215 | } | ||
216 | 2 | } | |
217 | #endif /* LV_HAVE_SSE4_1 */ | ||
218 | |||
219 | |||
220 | #ifdef LV_HAVE_GENERIC | ||
221 | |||
222 | 2 | static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, | |
223 | const int8_t* inputVector, | ||
224 | unsigned int num_points) | ||
225 | { | ||
226 | 2 | int16_t* outputVectorPtr = outputVector; | |
227 | 2 | const int8_t* inputVectorPtr = inputVector; | |
228 | 2 | unsigned int number = 0; | |
229 | |||
230 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
231 | 262142 | *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; | |
232 | } | ||
233 | 2 | } | |
234 | #endif /* LV_HAVE_GENERIC */ | ||
235 | |||
236 | |||
237 | #ifdef LV_HAVE_NEON | ||
238 | #include <arm_neon.h> | ||
239 | |||
240 | static inline void volk_8i_convert_16i_neon(int16_t* outputVector, | ||
241 | const int8_t* inputVector, | ||
242 | unsigned int num_points) | ||
243 | { | ||
244 | int16_t* outputVectorPtr = outputVector; | ||
245 | const int8_t* inputVectorPtr = inputVector; | ||
246 | unsigned int number; | ||
247 | const unsigned int eighth_points = num_points / 8; | ||
248 | |||
249 | int8x8_t input_vec; | ||
250 | int16x8_t converted_vec; | ||
251 | |||
252 | // NEON doesn't have a concept of 8 bit registers, so we are really | ||
253 | // dealing with the low half of 16-bit registers. Since this requires | ||
254 | // a move instruction we likely do better with ASM here. | ||
255 | for (number = 0; number < eighth_points; ++number) { | ||
256 | input_vec = vld1_s8(inputVectorPtr); | ||
257 | converted_vec = vmovl_s8(input_vec); | ||
258 | // Shift left by 8 rather than vmulq_s16 by a vector of 256; same result. | ||
259 | converted_vec = vshlq_n_s16(converted_vec, 8); | ||
260 | vst1q_s16(outputVectorPtr, converted_vec); | ||
261 | |||
262 | inputVectorPtr += 8; | ||
263 | outputVectorPtr += 8; | ||
264 | } | ||
265 | |||
266 | for (number = eighth_points * 8; number < num_points; number++) { | ||
267 | *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; | ||
268 | } | ||
269 | } | ||
270 | #endif /* LV_HAVE_NEON */ | ||
271 | |||
272 | |||
273 | #ifdef LV_HAVE_ORC | ||
274 | extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, | ||
275 | const int8_t* inputVector, | ||
276 | unsigned int num_points); | ||
277 | |||
278 | 2 | static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, | |
279 | const int8_t* inputVector, | ||
280 | unsigned int num_points) | ||
281 | { | ||
282 | 2 | volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); | |
283 | 2 | } | |
284 | #endif /* LV_HAVE_ORC */ | ||
285 | |||
286 | |||
287 | #endif /* INCLUDED_volk_8i_convert_16i_a_H */ | ||
288 |