Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2016 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_x2_multiply_16ic | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Multiplies two input complex vectors, point-by-point, storing the result in the | ||
16 | * output vector. WARNING: Saturation is not checked. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, | ||
21 | *                                 const lv_16sc_t* in_b, unsigned int num_points); \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li in_a: One of the vectors to be multiplied. | ||
25 | * \li in_b: The other vector to be multiplied. | ||
26 | * \li num_points: The number of complex data points to be multiplied from both input | ||
27 | * vectors. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li result: The vector where the results will be stored. | ||
31 | * | ||
32 | */ | ||
33 | |||
34 | #ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H | ||
35 | #define INCLUDED_volk_16ic_x2_multiply_16ic_H | ||
36 | |||
37 | #include <volk/volk_common.h> | ||
38 | #include <volk/volk_complex.h> | ||
39 | |||
40 | #ifdef LV_HAVE_GENERIC | ||
41 | |||
42 | 2 | static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, | |
43 | const lv_16sc_t* in_a, | ||
44 | const lv_16sc_t* in_b, | ||
45 | unsigned int num_points) | ||
46 | { | ||
47 | unsigned int n; | ||
48 | 2/2 (✓ Branch 0 taken 262142, ✓ Branch 1 taken 2) | 262144 | for (n = 0; n < num_points; n++) {
49 | 262142 | result[n] = in_a[n] * in_b[n]; | |
50 | } | ||
51 | 2 | } | |
52 | |||
53 | #endif /*LV_HAVE_GENERIC*/ | ||
54 | |||
55 | |||
56 | #ifdef LV_HAVE_SSE2 | ||
57 | #include <emmintrin.h> | ||
58 | |||
59 | 2 | static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, | |
60 | const lv_16sc_t* in_a, | ||
61 | const lv_16sc_t* in_b, | ||
62 | unsigned int num_points) | ||
63 | { | ||
64 | 2 | const unsigned int sse_iters = num_points / 4; | |
65 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
66 | result; | ||
67 | |||
68 | 2 | mask_imag = _mm_set_epi8( | |
69 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
70 | 2 | mask_real = _mm_set_epi8( | |
71 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
72 | |||
73 | 2 | const lv_16sc_t* _in_a = in_a; | |
74 | 2 | const lv_16sc_t* _in_b = in_b; | |
75 | 2 | lv_16sc_t* _out = out; | |
76 | unsigned int number; | ||
77 | |||
78 | 2/2 (✓ Branch 0 taken 65534, ✓ Branch 1 taken 2) | 65536 | for (number = 0; number < sse_iters; number++) {
79 | 65534 | a = _mm_load_si128( | |
80 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
81 | 65534 | b = _mm_load_si128((__m128i*)_in_b); | |
82 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
83 | |||
84 | 65534 | c_sr = _mm_srli_si128(c, 2); // shift right 2 bytes so each imag*imag | |
85 | // product lines up under its real*real product | ||
86 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
87 | 65534 | real = _mm_and_si128(real, | |
88 | mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
89 | |||
90 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
91 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
92 | |||
93 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
94 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
95 | |||
96 | 65534 | imag = _mm_adds_epi16(imag1, imag2); | |
97 | 65534 | imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
98 | |||
99 | 65534 | result = _mm_or_si128(real, imag); | |
100 | |||
101 | _mm_store_si128((__m128i*)_out, result); | ||
102 | |||
103 | 65534 | _in_a += 4; | |
104 | 65534 | _in_b += 4; | |
105 | 65534 | _out += 4; | |
106 | } | ||
107 | |||
108 | 2/2 (✓ Branch 0 taken 6, ✓ Branch 1 taken 2) | 8 | for (number = sse_iters * 4; number < num_points; ++number) {
109 | 6 | *_out++ = (*_in_a++) * (*_in_b++); | |
110 | } | ||
111 | 2 | } | |
112 | #endif /* LV_HAVE_SSE2 */ | ||
113 | |||
114 | |||
115 | #ifdef LV_HAVE_SSE2 | ||
116 | #include <emmintrin.h> | ||
117 | |||
118 | 2 | static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, | |
119 | const lv_16sc_t* in_a, | ||
120 | const lv_16sc_t* in_b, | ||
121 | unsigned int num_points) | ||
122 | { | ||
123 | 2 | const unsigned int sse_iters = num_points / 4; | |
124 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
125 | result; | ||
126 | |||
127 | 2 | mask_imag = _mm_set_epi8( | |
128 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
129 | 2 | mask_real = _mm_set_epi8( | |
130 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
131 | |||
132 | 2 | const lv_16sc_t* _in_a = in_a; | |
133 | 2 | const lv_16sc_t* _in_b = in_b; | |
134 | 2 | lv_16sc_t* _out = out; | |
135 | unsigned int number; | ||
136 | |||
137 | 2/2 (✓ Branch 0 taken 65534, ✓ Branch 1 taken 2) | 65536 | for (number = 0; number < sse_iters; number++) {
138 | 65534 | a = _mm_loadu_si128( | |
139 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
140 | 65534 | b = _mm_loadu_si128((__m128i*)_in_b); | |
141 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
142 | |||
143 | 65534 | c_sr = _mm_srli_si128(c, 2); // shift right 2 bytes so each imag*imag | |
144 | // product lines up under its real*real product | ||
145 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
146 | 65534 | real = _mm_and_si128(real, | |
147 | mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
148 | |||
149 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
150 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
151 | |||
152 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
153 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
154 | |||
155 | 65534 | imag = _mm_adds_epi16(imag1, imag2); | |
156 | 65534 | imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
157 | |||
158 | 65534 | result = _mm_or_si128(real, imag); | |
159 | |||
160 | _mm_storeu_si128((__m128i*)_out, result); | ||
161 | |||
162 | 65534 | _in_a += 4; | |
163 | 65534 | _in_b += 4; | |
164 | 65534 | _out += 4; | |
165 | } | ||
166 | |||
167 | 2/2 (✓ Branch 0 taken 6, ✓ Branch 1 taken 2) | 8 | for (number = sse_iters * 4; number < num_points; ++number) {
168 | 6 | *_out++ = (*_in_a++) * (*_in_b++); | |
169 | } | ||
170 | 2 | } | |
171 | #endif /* LV_HAVE_SSE2 */ | ||
172 | |||
173 | |||
174 | #ifdef LV_HAVE_AVX2 | ||
175 | #include <immintrin.h> | ||
176 | |||
177 | 2 | static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, | |
178 | const lv_16sc_t* in_a, | ||
179 | const lv_16sc_t* in_b, | ||
180 | unsigned int num_points) | ||
181 | { | ||
182 | 2 | unsigned int number = 0; | |
183 | 2 | const unsigned int avx2_points = num_points / 8; | |
184 | |||
185 | 2 | const lv_16sc_t* _in_a = in_a; | |
186 | 2 | const lv_16sc_t* _in_b = in_b; | |
187 | 2 | lv_16sc_t* _out = out; | |
188 | |||
189 | __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; | ||
190 | |||
191 | 2 | const __m256i mask_imag = _mm256_set_epi8(0xFF, | |
192 | 0xFF, | ||
193 | 0, | ||
194 | 0, | ||
195 | 0xFF, | ||
196 | 0xFF, | ||
197 | 0, | ||
198 | 0, | ||
199 | 0xFF, | ||
200 | 0xFF, | ||
201 | 0, | ||
202 | 0, | ||
203 | 0xFF, | ||
204 | 0xFF, | ||
205 | 0, | ||
206 | 0, | ||
207 | 0xFF, | ||
208 | 0xFF, | ||
209 | 0, | ||
210 | 0, | ||
211 | 0xFF, | ||
212 | 0xFF, | ||
213 | 0, | ||
214 | 0, | ||
215 | 0xFF, | ||
216 | 0xFF, | ||
217 | 0, | ||
218 | 0, | ||
219 | 0xFF, | ||
220 | 0xFF, | ||
221 | 0, | ||
222 | 0); | ||
223 | 2 | const __m256i mask_real = _mm256_set_epi8(0, | |
224 | 0, | ||
225 | 0xFF, | ||
226 | 0xFF, | ||
227 | 0, | ||
228 | 0, | ||
229 | 0xFF, | ||
230 | 0xFF, | ||
231 | 0, | ||
232 | 0, | ||
233 | 0xFF, | ||
234 | 0xFF, | ||
235 | 0, | ||
236 | 0, | ||
237 | 0xFF, | ||
238 | 0xFF, | ||
239 | 0, | ||
240 | 0, | ||
241 | 0xFF, | ||
242 | 0xFF, | ||
243 | 0, | ||
244 | 0, | ||
245 | 0xFF, | ||
246 | 0xFF, | ||
247 | 0, | ||
248 | 0, | ||
249 | 0xFF, | ||
250 | 0xFF, | ||
251 | 0, | ||
252 | 0, | ||
253 | 0xFF, | ||
254 | 0xFF); | ||
255 | |||
256 | 2/2 (✓ Branch 0 taken 32766, ✓ Branch 1 taken 2) | 32768 | for (; number < avx2_points; number++) {
257 | 32766 | a = _mm256_loadu_si256( | |
258 | (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi | ||
259 | 32766 | b = _mm256_loadu_si256( | |
260 | (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di | ||
261 | 32766 | c = _mm256_mullo_epi16(a, b); | |
262 | |||
263 | 32766 | c_sr = _mm256_srli_si256(c, 2); // shift right 2 bytes within each 128-bit lane | |
264 | // so each imag*imag product lines up under its real*real product | ||
265 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
266 | 32766 | real = _mm256_and_si256( | |
267 | real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
268 | |||
269 | 32766 | b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... | |
270 | 32766 | a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... | |
271 | |||
272 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
273 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
274 | |||
275 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); | |
276 | 32766 | imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
277 | |||
278 | 32766 | result = _mm256_or_si256(real, imag); | |
279 | |||
280 | _mm256_storeu_si256((__m256i*)_out, result); | ||
281 | |||
282 | 32766 | _in_a += 8; | |
283 | 32766 | _in_b += 8; | |
284 | 32766 | _out += 8; | |
285 | } | ||
286 | |||
287 | 2 | number = avx2_points * 8; | |
288 | 2/2 (✓ Branch 0 taken 14, ✓ Branch 1 taken 2) | 16 | for (; number < num_points; number++) {
289 | 14 | *_out++ = (*_in_a++) * (*_in_b++); | |
290 | } | ||
291 | 2 | } | |
292 | #endif /* LV_HAVE_AVX2 */ | ||
293 | |||
294 | |||
295 | #ifdef LV_HAVE_AVX2 | ||
296 | #include <immintrin.h> | ||
297 | |||
298 | 2 | static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, | |
299 | const lv_16sc_t* in_a, | ||
300 | const lv_16sc_t* in_b, | ||
301 | unsigned int num_points) | ||
302 | { | ||
303 | 2 | unsigned int number = 0; | |
304 | 2 | const unsigned int avx2_points = num_points / 8; | |
305 | |||
306 | 2 | const lv_16sc_t* _in_a = in_a; | |
307 | 2 | const lv_16sc_t* _in_b = in_b; | |
308 | 2 | lv_16sc_t* _out = out; | |
309 | |||
310 | __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; | ||
311 | |||
312 | 2 | const __m256i mask_imag = _mm256_set_epi8(0xFF, | |
313 | 0xFF, | ||
314 | 0, | ||
315 | 0, | ||
316 | 0xFF, | ||
317 | 0xFF, | ||
318 | 0, | ||
319 | 0, | ||
320 | 0xFF, | ||
321 | 0xFF, | ||
322 | 0, | ||
323 | 0, | ||
324 | 0xFF, | ||
325 | 0xFF, | ||
326 | 0, | ||
327 | 0, | ||
328 | 0xFF, | ||
329 | 0xFF, | ||
330 | 0, | ||
331 | 0, | ||
332 | 0xFF, | ||
333 | 0xFF, | ||
334 | 0, | ||
335 | 0, | ||
336 | 0xFF, | ||
337 | 0xFF, | ||
338 | 0, | ||
339 | 0, | ||
340 | 0xFF, | ||
341 | 0xFF, | ||
342 | 0, | ||
343 | 0); | ||
344 | 2 | const __m256i mask_real = _mm256_set_epi8(0, | |
345 | 0, | ||
346 | 0xFF, | ||
347 | 0xFF, | ||
348 | 0, | ||
349 | 0, | ||
350 | 0xFF, | ||
351 | 0xFF, | ||
352 | 0, | ||
353 | 0, | ||
354 | 0xFF, | ||
355 | 0xFF, | ||
356 | 0, | ||
357 | 0, | ||
358 | 0xFF, | ||
359 | 0xFF, | ||
360 | 0, | ||
361 | 0, | ||
362 | 0xFF, | ||
363 | 0xFF, | ||
364 | 0, | ||
365 | 0, | ||
366 | 0xFF, | ||
367 | 0xFF, | ||
368 | 0, | ||
369 | 0, | ||
370 | 0xFF, | ||
371 | 0xFF, | ||
372 | 0, | ||
373 | 0, | ||
374 | 0xFF, | ||
375 | 0xFF); | ||
376 | |||
377 | 2/2 (✓ Branch 0 taken 32766, ✓ Branch 1 taken 2) | 32768 | for (; number < avx2_points; number++) {
378 | 32766 | a = _mm256_load_si256( | |
379 | (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi | ||
380 | 32766 | b = _mm256_load_si256( | |
381 | (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di | ||
382 | 32766 | c = _mm256_mullo_epi16(a, b); | |
383 | |||
384 | 32766 | c_sr = _mm256_srli_si256(c, 2); // shift right 2 bytes within each 128-bit lane | |
385 | // so each imag*imag product lines up under its real*real product | ||
386 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
387 | 32766 | real = _mm256_and_si256( | |
388 | real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
389 | |||
390 | 32766 | b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... | |
391 | 32766 | a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... | |
392 | |||
393 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
394 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
395 | |||
396 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); | |
397 | 32766 | imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
398 | |||
399 | 32766 | result = _mm256_or_si256(real, imag); | |
400 | |||
401 | _mm256_store_si256((__m256i*)_out, result); | ||
402 | |||
403 | 32766 | _in_a += 8; | |
404 | 32766 | _in_b += 8; | |
405 | 32766 | _out += 8; | |
406 | } | ||
407 | |||
408 | 2 | number = avx2_points * 8; | |
409 | 2/2 (✓ Branch 0 taken 14, ✓ Branch 1 taken 2) | 16 | for (; number < num_points; number++) {
410 | 14 | *_out++ = (*_in_a++) * (*_in_b++); | |
411 | } | ||
412 | 2 | } | |
413 | #endif /* LV_HAVE_AVX2 */ | ||
414 | |||
415 | |||
416 | #ifdef LV_HAVE_NEON | ||
417 | #include <arm_neon.h> | ||
418 | |||
419 | static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, | ||
420 | const lv_16sc_t* in_a, | ||
421 | const lv_16sc_t* in_b, | ||
422 | unsigned int num_points) | ||
423 | { | ||
424 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
425 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
426 | unsigned int quarter_points = num_points / 4; | ||
427 | int16x4x2_t a_val, b_val, c_val; | ||
428 | int16x4x2_t tmp_real, tmp_imag; | ||
429 | unsigned int number = 0; | ||
430 | |||
431 | for (number = 0; number < quarter_points; ++number) { | ||
432 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
433 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
434 | __VOLK_PREFETCH(a_ptr + 4); | ||
435 | __VOLK_PREFETCH(b_ptr + 4); | ||
436 | |||
437 | // multiply the real*real and imag*imag to get real result | ||
438 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
439 | tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
440 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
441 | tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); | ||
442 | |||
443 | // Multiply cross terms to get the imaginary result | ||
444 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
445 | tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); | ||
446 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
447 | tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
448 | |||
449 | // store the results | ||
450 | c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); | ||
451 | c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); | ||
452 | vst2_s16((int16_t*)out, c_val); | ||
453 | |||
454 | a_ptr += 4; | ||
455 | b_ptr += 4; | ||
456 | out += 4; | ||
457 | } | ||
458 | |||
459 | for (number = quarter_points * 4; number < num_points; number++) { | ||
460 | *out++ = (*a_ptr++) * (*b_ptr++); | ||
461 | } | ||
462 | } | ||
463 | #endif /* LV_HAVE_NEON */ | ||
464 | |||
465 | #endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/ | ||
466 |
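
For readers tracing the SSE2/AVX2 lane arithmetic in the listing above, the per-sample operation is the ordinary complex product: re = ar*br - ai*bi and im = ar*bi + ai*br. The sketch below is not part of the covered file; the helper name `complex_mul_16ic_scalar` is illustrative, and it assumes only the `lv_cmake`/`lv_creal`/`lv_cimag` helpers from `<volk/volk_complex.h>`. Note that the SIMD kernels form the products with `_mm_mullo_epi16` (low 16 bits only) and combine them with saturating adds/subtracts, so overflow behaviour can differ from a plain scalar multiply.

```c
#include <stdint.h>
#include <volk/volk_complex.h>

/* Illustrative scalar view of one output sample of volk_16ic_x2_multiply_16ic.
 * Overflow is simply truncated here; the SIMD kernels above truncate the
 * individual products but saturate the final add/subtract, and the header
 * explicitly warns that saturation is not checked. */
static inline lv_16sc_t complex_mul_16ic_scalar(lv_16sc_t a, lv_16sc_t b)
{
    const int16_t ar = (int16_t)lv_creal(a), ai = (int16_t)lv_cimag(a);
    const int16_t br = (int16_t)lv_creal(b), bi = (int16_t)lv_cimag(b);
    const int16_t re = (int16_t)(ar * br - ai * bi); /* real part */
    const int16_t im = (int16_t)(ar * bi + ai * br); /* imaginary part */
    return lv_cmake(re, im);
}
```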
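
And a minimal usage sketch for the dispatcher documented at the top of the header. It is likewise not part of the covered file; it assumes the standard VOLK allocation helpers (`volk_get_alignment`, `volk_malloc`, `volk_free`) from `<volk/volk.h>`, and the sample values are purely illustrative.

```c
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 4;
    const size_t alignment = volk_get_alignment();
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* result = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

    for (unsigned int n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)(n + 1), (int16_t)2); /* (1+2j), (2+2j), ... */
        in_b[n] = lv_cmake((int16_t)3, (int16_t)-1);      /* (3-1j) for every sample */
    }

    /* The dispatcher selects the fastest available kernel (generic, SSE2, AVX2, ...). */
    volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);

    for (unsigned int n = 0; n < num_points; n++) {
        /* e.g. (1+2j)*(3-1j) = 5+5j */
        printf("%d%+dj\n", (int)lv_creal(result[n]), (int)lv_cimag(result[n]));
    }

    volk_free(in_a);
    volk_free(in_b);
    volk_free(result);
    return 0;
}
```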