GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16ic_x2_multiply_16ic.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       116     116     100.0%
Functions:     5       5     100.0%
Branches:     18      18     100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16ic_x2_multiply_16ic
12 *
13 * \b Overview
14 *
15 * Multiplies two input complex vectors, point-by-point, storing the result in the third
16 * vector. WARNING: Saturation is not checked.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
21 * lv_16sc_t* in_b, unsigned int num_points); \endcode
22 *
23 * \b Inputs
24 * \li in_a: One of the vectors to be multiplied.
25 * \li in_b: The other vector to be multiplied.
26 * \li num_points: The number of complex data points to be multiplied from both input
27 * vectors.
28 *
29 * \b Outputs
30 * \li result: The vector where the results will be stored.
31 *
32 */
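For reference, a minimal usage sketch of the dispatcher documented above. This is illustrative only and not part of the covered source: the buffer length, test values, and the use of VOLK's volk_get_alignment/volk_malloc/volk_free helpers and the lv_cmake/lv_creal/lv_cimag macros are assumptions about a typical caller.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 1024; /* assumed buffer length */
    const size_t alignment = volk_get_alignment();
    unsigned int n;

    /* Aligned buffers so the aligned (a_) kernels may be selected. */
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t* result = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);

    for (n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)n, (int16_t)1); /* arbitrary test data */
        in_b[n] = lv_cmake((int16_t)2, (int16_t)-3);
    }

    /* The dispatcher selects the best available implementation (generic/SSE2/AVX2/NEON). */
    volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);

    printf("result[0] = %d %+dj\n", (int)lv_creal(result[0]), (int)lv_cimag(result[0]));

    volk_free(in_a);
    volk_free(in_b);
    volk_free(result);
    return 0;
}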
33
34 #ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
35 #define INCLUDED_volk_16ic_x2_multiply_16ic_H
36
37 #include <volk/volk_common.h>
38 #include <volk/volk_complex.h>
39
40 #ifdef LV_HAVE_GENERIC
41
42 2 static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
43 const lv_16sc_t* in_a,
44 const lv_16sc_t* in_b,
45 unsigned int num_points)
46 {
47 unsigned int n;
48 2/2 262144 for (n = 0; n < num_points; n++) {
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
49 262142 result[n] = in_a[n] * in_b[n];
50 }
51 2 }
52
53 #endif /*LV_HAVE_GENERIC*/
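Each output element above is the ordinary complex product. Writing a = ar + j*ai and b = br + j*bi, the value stored is:

    a * b = (ar*br - ai*bi) + j*(ar*bi + ai*br)

As the overview warns, overflow of the 16-bit terms is not checked; the SIMD kernels below compute these same two combinations lane by lane.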
54
55
56 #ifdef LV_HAVE_SSE2
57 #include <emmintrin.h>
58
59 2 static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
60 const lv_16sc_t* in_a,
61 const lv_16sc_t* in_b,
62 unsigned int num_points)
63 {
64 2 const unsigned int sse_iters = num_points / 4;
65 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
66 result;
67
68 2 mask_imag = _mm_set_epi8(
69 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
70 2 mask_real = _mm_set_epi8(
71 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
72
73 2 const lv_16sc_t* _in_a = in_a;
74 2 const lv_16sc_t* _in_b = in_b;
75 2 lv_16sc_t* _out = out;
76 unsigned int number;
77
78 2/2 65536 for (number = 0; number < sse_iters; number++) {
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
79 65534 a = _mm_load_si128(
80 (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
81 65534 b = _mm_load_si128((__m128i*)_in_b);
82 65534 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
83
84 65534 c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
85 // zeros, and store the results in dst.
86 65534 real = _mm_subs_epi16(c, c_sr);
87 65534 real = _mm_and_si128(real,
88 mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
89
90 65534 b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
91 65534 a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
92
93 65534 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
94 65534 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
95
96 65534 imag = _mm_adds_epi16(imag1, imag2);
97 65534 imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
98
99 65534 result = _mm_or_si128(real, imag);
100
101 _mm_store_si128((__m128i*)_out, result);
102
103 65534 _in_a += 4;
104 65534 _in_b += 4;
105 65534 _out += 4;
106 }
107
108 2/2 8 for (number = sse_iters * 4; number < num_points; ++number) {
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
109 6 *_out++ = (*_in_a++) * (*_in_b++);
110 }
111 2 }
112 #endif /* LV_HAVE_SSE2 */
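The shift-and-mask sequence in the loop above evaluates the product element-wise on interleaved (real, imag) int16 lanes. A scalar sketch of what one element goes through follows; sat16() is a hypothetical helper standing in for the saturating behaviour of _mm_subs_epi16/_mm_adds_epi16, while the four products keep only their low 16 bits (and so wrap), matching _mm_mullo_epi16.

#include <stdint.h>

/* Hypothetical helper: clamp a 32-bit value to the int16_t range. */
static int16_t sat16(int32_t v)
{
    if (v > INT16_MAX)
        return INT16_MAX;
    if (v < INT16_MIN)
        return INT16_MIN;
    return (int16_t)v;
}

/* One complex element: a = ar + j*ai, b = br + j*bi, all int16_t. */
static void cmul16_sketch(
    int16_t ar, int16_t ai, int16_t br, int16_t bi, int16_t* re, int16_t* im)
{
    int16_t arbr = (int16_t)(ar * br); /* _mm_mullo_epi16 on the even lanes of c      */
    int16_t aibi = (int16_t)(ai * bi); /* ... and on the odd lanes, exposed by c_sr   */
    int16_t aibr = (int16_t)(ai * br); /* imag1: a times b shifted left by one lane   */
    int16_t arbi = (int16_t)(ar * bi); /* imag2: b times a shifted left by one lane   */

    *re = sat16((int32_t)arbr - aibi); /* _mm_subs_epi16, kept by mask_real */
    *im = sat16((int32_t)aibr + arbi); /* _mm_adds_epi16, kept by mask_imag */
}

The unaligned SSE2 and AVX2 kernels below repeat the same arithmetic with different load/store widths; the NEON kernel computes the same products via de-interleaved vld2_s16 loads and ordinary (non-saturating) vsub_s16/vadd_s16.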
113
114
115 #ifdef LV_HAVE_SSE2
116 #include <emmintrin.h>
117
118 2 static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
119 const lv_16sc_t* in_a,
120 const lv_16sc_t* in_b,
121 unsigned int num_points)
122 {
123 2 const unsigned int sse_iters = num_points / 4;
124 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
125 result;
126
127 2 mask_imag = _mm_set_epi8(
128 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
129 2 mask_real = _mm_set_epi8(
130 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
131
132 2 const lv_16sc_t* _in_a = in_a;
133 2 const lv_16sc_t* _in_b = in_b;
134 2 lv_16sc_t* _out = out;
135 unsigned int number;
136
137 2/2 65536 for (number = 0; number < sse_iters; number++) {
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
138 65534 a = _mm_loadu_si128(
139 (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
140 65534 b = _mm_loadu_si128((__m128i*)_in_b);
141 65534 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
142
143 65534 c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
144 // zeros, and store the results in dst.
145 65534 real = _mm_subs_epi16(c, c_sr);
146 65534 real = _mm_and_si128(real,
147 mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
148
149 65534 b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
150 65534 a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
151
152 65534 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
153 65534 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
154
155 65534 imag = _mm_adds_epi16(imag1, imag2);
156 65534 imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
157
158 65534 result = _mm_or_si128(real, imag);
159
160 _mm_storeu_si128((__m128i*)_out, result);
161
162 65534 _in_a += 4;
163 65534 _in_b += 4;
164 65534 _out += 4;
165 }
166
167 2/2 8 for (number = sse_iters * 4; number < num_points; ++number) {
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
168 6 *_out++ = (*_in_a++) * (*_in_b++);
169 }
170 2 }
171 #endif /* LV_HAVE_SSE2 */
172
173
174 #ifdef LV_HAVE_AVX2
175 #include <immintrin.h>
176
177 2 static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
178 const lv_16sc_t* in_a,
179 const lv_16sc_t* in_b,
180 unsigned int num_points)
181 {
182 2 unsigned int number = 0;
183 2 const unsigned int avx2_points = num_points / 8;
184
185 2 const lv_16sc_t* _in_a = in_a;
186 2 const lv_16sc_t* _in_b = in_b;
187 2 lv_16sc_t* _out = out;
188
189 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
190
191 2 const __m256i mask_imag = _mm256_set_epi8(0xFF,
192 0xFF,
193 0,
194 0,
195 0xFF,
196 0xFF,
197 0,
198 0,
199 0xFF,
200 0xFF,
201 0,
202 0,
203 0xFF,
204 0xFF,
205 0,
206 0,
207 0xFF,
208 0xFF,
209 0,
210 0,
211 0xFF,
212 0xFF,
213 0,
214 0,
215 0xFF,
216 0xFF,
217 0,
218 0,
219 0xFF,
220 0xFF,
221 0,
222 0);
223 2 const __m256i mask_real = _mm256_set_epi8(0,
224 0,
225 0xFF,
226 0xFF,
227 0,
228 0,
229 0xFF,
230 0xFF,
231 0,
232 0,
233 0xFF,
234 0xFF,
235 0,
236 0,
237 0xFF,
238 0xFF,
239 0,
240 0,
241 0xFF,
242 0xFF,
243 0,
244 0,
245 0xFF,
246 0xFF,
247 0,
248 0,
249 0xFF,
250 0xFF,
251 0,
252 0,
253 0xFF,
254 0xFF);
255
256 2/2 32768 for (; number < avx2_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
257 32766 a = _mm256_loadu_si256(
258 (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
259 32766 b = _mm256_loadu_si256(
260 (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
261 32766 c = _mm256_mullo_epi16(a, b);
262
263 32766 c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
264 // zeros, and store the results in dst.
265 32766 real = _mm256_subs_epi16(c, c_sr);
266 32766 real = _mm256_and_si256(
267 real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
268
269 32766 b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
270 32766 a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
271
272 32766 imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
273 32766 imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
274
275 32766 imag = _mm256_adds_epi16(imag1, imag2);
276 32766 imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
277
278 32766 result = _mm256_or_si256(real, imag);
279
280 _mm256_storeu_si256((__m256i*)_out, result);
281
282 32766 _in_a += 8;
283 32766 _in_b += 8;
284 32766 _out += 8;
285 }
286
287 2 number = avx2_points * 8;
288 2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
289 14 *_out++ = (*_in_a++) * (*_in_b++);
290 }
291 2 }
292 #endif /* LV_HAVE_AVX2 */
293
294
295 #ifdef LV_HAVE_AVX2
296 #include <immintrin.h>
297
298 2 static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
299 const lv_16sc_t* in_a,
300 const lv_16sc_t* in_b,
301 unsigned int num_points)
302 {
303 2 unsigned int number = 0;
304 2 const unsigned int avx2_points = num_points / 8;
305
306 2 const lv_16sc_t* _in_a = in_a;
307 2 const lv_16sc_t* _in_b = in_b;
308 2 lv_16sc_t* _out = out;
309
310 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
311
312 2 const __m256i mask_imag = _mm256_set_epi8(0xFF,
313 0xFF,
314 0,
315 0,
316 0xFF,
317 0xFF,
318 0,
319 0,
320 0xFF,
321 0xFF,
322 0,
323 0,
324 0xFF,
325 0xFF,
326 0,
327 0,
328 0xFF,
329 0xFF,
330 0,
331 0,
332 0xFF,
333 0xFF,
334 0,
335 0,
336 0xFF,
337 0xFF,
338 0,
339 0,
340 0xFF,
341 0xFF,
342 0,
343 0);
344 2 const __m256i mask_real = _mm256_set_epi8(0,
345 0,
346 0xFF,
347 0xFF,
348 0,
349 0,
350 0xFF,
351 0xFF,
352 0,
353 0,
354 0xFF,
355 0xFF,
356 0,
357 0,
358 0xFF,
359 0xFF,
360 0,
361 0,
362 0xFF,
363 0xFF,
364 0,
365 0,
366 0xFF,
367 0xFF,
368 0,
369 0,
370 0xFF,
371 0xFF,
372 0,
373 0,
374 0xFF,
375 0xFF);
376
377 2/2 32768 for (; number < avx2_points; number++) {
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
378 32766 a = _mm256_load_si256(
379 (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
380 32766 b = _mm256_load_si256(
381 (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
382 32766 c = _mm256_mullo_epi16(a, b);
383
384 32766 c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
385 // zeros, and store the results in dst.
386 32766 real = _mm256_subs_epi16(c, c_sr);
387 32766 real = _mm256_and_si256(
388 real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
389
390 32766 b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
391 32766 a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
392
393 32766 imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
394 32766 imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
395
396 32766 imag = _mm256_adds_epi16(imag1, imag2);
397 32766 imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
398
399 32766 result = _mm256_or_si256(real, imag);
400
401 _mm256_store_si256((__m256i*)_out, result);
402
403 32766 _in_a += 8;
404 32766 _in_b += 8;
405 32766 _out += 8;
406 }
407
408 2 number = avx2_points * 8;
409 2/2 16 for (; number < num_points; number++) {
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
410 14 *_out++ = (*_in_a++) * (*_in_b++);
411 }
412 2 }
413 #endif /* LV_HAVE_AVX2 */
414
415
416 #ifdef LV_HAVE_NEON
417 #include <arm_neon.h>
418
419 static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
420 const lv_16sc_t* in_a,
421 const lv_16sc_t* in_b,
422 unsigned int num_points)
423 {
424 lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
425 lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
426 unsigned int quarter_points = num_points / 4;
427 int16x4x2_t a_val, b_val, c_val;
428 int16x4x2_t tmp_real, tmp_imag;
429 unsigned int number = 0;
430
431 for (number = 0; number < quarter_points; ++number) {
432 a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
433 b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
434 __VOLK_PREFETCH(a_ptr + 4);
435 __VOLK_PREFETCH(b_ptr + 4);
436
437 // multiply the real*real and imag*imag to get real result
438 // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
439 tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
440 // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
441 tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
442
443 // Multiply cross terms to get the imaginary result
444 // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
445 tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
446 // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
447 tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
448
449 // store the results
450 c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
451 c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
452 vst2_s16((int16_t*)out, c_val);
453
454 a_ptr += 4;
455 b_ptr += 4;
456 out += 4;
457 }
458
459 for (number = quarter_points * 4; number < num_points; number++) {
460 *out++ = (*a_ptr++) * (*b_ptr++);
461 }
462 }
463 #endif /* LV_HAVE_NEON */
464
465 #endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
466