GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16ic_x2_dot_prod_16ic.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:       158     158     100.0%
Functions:     5       5     100.0%
Branches:     30      34      88.2%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16ic_x2_dot_prod_16ic
12 *
13 * \b Overview
14 *
15 * Multiplies two input complex vectors (16-bit integer per component) element by
16 * element and accumulates the products, storing the result. Results are saturated,
17 * so they never exceed the limits of the data type.
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a,
22 *     const lv_16sc_t* in_b, unsigned int num_points); \endcode
23 *
24 * \b Inputs
25 * \li in_a: One of the vectors to be multiplied and accumulated.
26 * \li in_b: The other vector to be multiplied and accumulated.
27 * \li num_points: Number of complex values to be multiplied together, accumulated and
28 * stored into \p result.
29 *
30 * \b Outputs
31 * \li result: Value of the accumulated result.
32 *
33 */
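
For context, a minimal sketch of how a caller might drive this dispatcher, assuming
the standard VOLK helpers volk_malloc/volk_get_alignment/volk_free (not part of the
measured file):

    #include <stdint.h>
    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int num_points = 8;
        /* volk_malloc returns buffers aligned for the widest enabled SIMD
           ISA, so the dispatcher may select an aligned (a_) protokernel. */
        lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(
            num_points * sizeof(lv_16sc_t), volk_get_alignment());
        lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(
            num_points * sizeof(lv_16sc_t), volk_get_alignment());
        lv_16sc_t result;
        unsigned int n;
        for (n = 0; n < num_points; n++) {
            in_a[n] = lv_cmake((int16_t)(n + 1), (int16_t)0);
            in_b[n] = lv_cmake((int16_t)1, (int16_t)1);
        }
        volk_16ic_x2_dot_prod_16ic(&result, in_a, in_b, num_points);
        printf("dot product: %d%+di\n", lv_creal(result), lv_cimag(result));
        volk_free(in_a);
        volk_free(in_b);
        return 0;
    }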
34
35 #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
36 #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
37
38 #include <volk/saturation_arithmetic.h>
39 #include <volk/volk_common.h>
40 #include <volk/volk_complex.h>
41
42
43 #ifdef LV_HAVE_GENERIC
44
45 2 static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
46 const lv_16sc_t* in_a,
47 const lv_16sc_t* in_b,
48 unsigned int num_points)
49 {
50 2 result[0] = lv_cmake((int16_t)0, (int16_t)0);
51 unsigned int n;
52
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (n = 0; n < num_points; n++) {
53 262142 lv_16sc_t tmp = in_a[n] * in_b[n];
54 262142 result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
55 sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
56 }
57 2 }
58
59 #endif /*LV_HAVE_GENERIC*/
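
The generic kernel leans on sat_adds16i() from volk/saturation_arithmetic.h. As a
rough model of its semantics (a sketch, not the library's exact definition): widen
to 32 bits, add, clamp to the int16_t range.

    #include <limits.h>
    #include <stdint.h>

    static inline int16_t sat_adds16i_model(int16_t x, int16_t y)
    {
        int32_t res = (int32_t)x + (int32_t)y; /* cannot overflow in 32 bits */
        if (res < SHRT_MIN)
            res = SHRT_MIN;
        if (res > SHRT_MAX)
            res = SHRT_MAX;
        return (int16_t)res;
    }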
60
61
62 #ifdef LV_HAVE_SSE2
63 #include <emmintrin.h>
64
65 2 static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
66 const lv_16sc_t* in_a,
67 const lv_16sc_t* in_b,
68 unsigned int num_points)
69 {
70 2 lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
71
72 2 const unsigned int sse_iters = num_points / 4;
73 unsigned int number;
74
75 2 const lv_16sc_t* _in_a = in_a;
76 2 const lv_16sc_t* _in_b = in_b;
77 2 lv_16sc_t* _out = out;
78
79
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (sse_iters > 0) {
80 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
81 realcacc, imagcacc;
82 __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
83
84 2 realcacc = _mm_setzero_si128();
85 2 imagcacc = _mm_setzero_si128();
86
87 2 mask_imag = _mm_set_epi8(
88 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
89 2 mask_real = _mm_set_epi8(
90 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
91
92
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (number = 0; number < sse_iters; number++) {
93 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
94 65534 a = _mm_load_si128(
95 (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
96 65534 __VOLK_PREFETCH(_in_a + 8);
97 65534 b = _mm_load_si128((__m128i*)_in_b);
98 65534 __VOLK_PREFETCH(_in_b + 8);
99 65534 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
100
101 65534 c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, filling with
102 // zeros: slides each a.i*b.i into the real lane
103 65534 real = _mm_subs_epi16(c, c_sr);
104
105 65534 b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
106 65534 a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
107
108 65534 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
109 65534 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
110
111 65534 imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
112
113 65534 realcacc = _mm_adds_epi16(realcacc, real);
114 65534 imagcacc = _mm_adds_epi16(imagcacc, imag);
115
116 65534 _in_a += 4;
117 65534 _in_b += 4;
118 }
119
120 2 realcacc = _mm_and_si128(realcacc, mask_real);
121 2 imagcacc = _mm_and_si128(imagcacc, mask_imag);
122
123 2 a = _mm_or_si128(realcacc, imagcacc);
124
125 _mm_store_si128((__m128i*)dotProductVector,
126 a); // Store the results back into the dot product vector
127
128
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (number = 0; number < 4; ++number) {
129 8 dotProduct = lv_cmake(
130 sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
131 sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
132 }
133 }
134
135
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (number = 0; number < (num_points % 4); ++number) {
136 6 lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
137 6 dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
138 sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
139 }
140
141 2 *_out = dotProduct;
142 2 }
143
144 #endif /* LV_HAVE_SSE2 */
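
The shift/mullo/mask sequence above is easier to follow per complex pair. Below is
a scalar model of what one 32-bit [imag:real] slot ends up holding (illustrative
only; the (int16_t) casts mirror _mm_mullo_epi16 keeping the low 16 bits, and the
kernel additionally saturates the subtraction and the cross-term addition):

    #include <stdint.h>

    static void sse2_slot_model(int16_t ar, int16_t ai, int16_t br, int16_t bi,
                                int16_t* re, int16_t* im)
    {
        int16_t rr = (int16_t)(ar * br); /* real lane of c */
        int16_t ii = (int16_t)(ai * bi); /* imag lane of c */
        /* _mm_srli_si128(c, 2) slides ii into the real lane, so the
           saturating subtract leaves ar*br - ai*bi there */
        *re = (int16_t)(rr - ii);
        /* the slli/mullo pair forms the cross terms; their saturating
           sum lands in the imag lane */
        *im = (int16_t)(ar * bi + ai * br);
    }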
145
146
147 #ifdef LV_HAVE_SSE2
148 #include <emmintrin.h>
149
150 2 static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
151 const lv_16sc_t* in_a,
152 const lv_16sc_t* in_b,
153 unsigned int num_points)
154 {
155 2 lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
156
157 2 const unsigned int sse_iters = num_points / 4;
158
159 2 const lv_16sc_t* _in_a = in_a;
160 2 const lv_16sc_t* _in_b = in_b;
161 2 lv_16sc_t* _out = out;
162 unsigned int number;
163
164
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (sse_iters > 0) {
165 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
166 realcacc, imagcacc, result;
167 __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
168
169 2 realcacc = _mm_setzero_si128();
170 2 imagcacc = _mm_setzero_si128();
171
172 2 mask_imag = _mm_set_epi8(
173 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
174 2 mask_real = _mm_set_epi8(
175 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
176
177
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (number = 0; number < sse_iters; number++) {
178 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
179 65534 a = _mm_loadu_si128(
180 (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
181 65534 __VOLK_PREFETCH(_in_a + 8);
182 65534 b = _mm_loadu_si128((__m128i*)_in_b);
183 65534 __VOLK_PREFETCH(_in_b + 8);
184 65534 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
185
186 65534 c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, filling with
187 // zeros: slides each a.i*b.i into the real lane
188 65534 real = _mm_subs_epi16(c, c_sr);
189
190 65534 b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
191 65534 a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
192
193 65534 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
194 65534 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
195
196 65534 imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
197
198 65534 realcacc = _mm_adds_epi16(realcacc, real);
199 65534 imagcacc = _mm_adds_epi16(imagcacc, imag);
200
201 65534 _in_a += 4;
202 65534 _in_b += 4;
203 }
204
205 2 realcacc = _mm_and_si128(realcacc, mask_real);
206 2 imagcacc = _mm_and_si128(imagcacc, mask_imag);
207
208 2 result = _mm_or_si128(realcacc, imagcacc);
209
210 _mm_storeu_si128((__m128i*)dotProductVector,
211 result); // Store the results back into the dot product vector
212
213
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (number = 0; number < 4; ++number) {
214 8 dotProduct = lv_cmake(
215 sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
216 sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
217 }
218 }
219
220
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (number = 0; number < (num_points % 4); ++number) {
221 6 lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
222 6 dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
223 sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
224 }
225
226 2 *_out = dotProduct;
227 2 }
228 #endif /* LV_HAVE_SSE2 */
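
The a_/u_ pair differs only in _mm_load_si128/_mm_store_si128 versus their
unaligned counterparts. A sketch of how a caller could steer between the two,
assuming the usual generated volk_..._a/_u entry points and volk_is_aligned()
(the plain dispatcher performs this check itself; calling the a_/u_ entry points
directly just skips the runtime test):

    #include <volk/volk.h>

    static void dot_prod_16ic(lv_16sc_t* result,
                              const lv_16sc_t* a,
                              const lv_16sc_t* b,
                              unsigned int n)
    {
        if (volk_is_aligned(a) && volk_is_aligned(b)) {
            volk_16ic_x2_dot_prod_16ic_a(result, a, b, n);
        } else {
            volk_16ic_x2_dot_prod_16ic_u(result, a, b, n);
        }
    }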
229
230
231 #ifdef LV_HAVE_AVX2
232 #include <immintrin.h>
233
234 2 static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
235 const lv_16sc_t* in_a,
236 const lv_16sc_t* in_b,
237 unsigned int num_points)
238 {
239 2 lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
240
241 2 const unsigned int avx_iters = num_points / 8;
242
243 2 const lv_16sc_t* _in_a = in_a;
244 2 const lv_16sc_t* _in_b = in_b;
245 2 lv_16sc_t* _out = out;
246 unsigned int number;
247
248
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (avx_iters > 0) {
249 __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
250 realcacc, imagcacc, result;
251 __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
252
253 2 realcacc = _mm256_setzero_si256();
254 2 imagcacc = _mm256_setzero_si256();
255
256 2 mask_imag = _mm256_set_epi8(0xFF,
257 0xFF,
258 0,
259 0,
260 0xFF,
261 0xFF,
262 0,
263 0,
264 0xFF,
265 0xFF,
266 0,
267 0,
268 0xFF,
269 0xFF,
270 0,
271 0,
272 0xFF,
273 0xFF,
274 0,
275 0,
276 0xFF,
277 0xFF,
278 0,
279 0,
280 0xFF,
281 0xFF,
282 0,
283 0,
284 0xFF,
285 0xFF,
286 0,
287 0);
288 2 mask_real = _mm256_set_epi8(0,
289 0,
290 0xFF,
291 0xFF,
292 0,
293 0,
294 0xFF,
295 0xFF,
296 0,
297 0,
298 0xFF,
299 0xFF,
300 0,
301 0,
302 0xFF,
303 0xFF,
304 0,
305 0,
306 0xFF,
307 0xFF,
308 0,
309 0,
310 0xFF,
311 0xFF,
312 0,
313 0,
314 0xFF,
315 0xFF,
316 0,
317 0,
318 0xFF,
319 0xFF);
320
321
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < avx_iters; number++) {
322 32766 a = _mm256_loadu_si256((__m256i*)_in_a);
323 32766 __VOLK_PREFETCH(_in_a + 16);
324 32766 b = _mm256_loadu_si256((__m256i*)_in_b);
325 32766 __VOLK_PREFETCH(_in_b + 16);
326 32766 c = _mm256_mullo_epi16(a, b);
327
328 32766 c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane right by 2
329 // bytes, sliding each a.i*b.i into the real lane
330 32766 real = _mm256_subs_epi16(c, c_sr);
331
332 32766 b_sl = _mm256_slli_si256(b, 2);
333 32766 a_sl = _mm256_slli_si256(a, 2);
334
335 32766 imag1 = _mm256_mullo_epi16(a, b_sl);
336 32766 imag2 = _mm256_mullo_epi16(b, a_sl);
337
338 32766 imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
339
340 32766 realcacc = _mm256_adds_epi16(realcacc, real);
341 32766 imagcacc = _mm256_adds_epi16(imagcacc, imag);
342
343 32766 _in_a += 8;
344 32766 _in_b += 8;
345 }
346
347 2 realcacc = _mm256_and_si256(realcacc, mask_real);
348 2 imagcacc = _mm256_and_si256(imagcacc, mask_imag);
349
350 2 result = _mm256_or_si256(realcacc, imagcacc);
351
352 _mm256_storeu_si256((__m256i*)dotProductVector,
353 result); // Store the results back into the dot product vector
354
355
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (number = 0; number < 8; ++number) {
356 16 dotProduct = lv_cmake(
357 sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
358 sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
359 }
360 }
361
362
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = 0; number < (num_points % 8); ++number) {
363 14 lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
364 14 dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
365 sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
366 }
367
368 2 *_out = dotProduct;
369 2 }
370 #endif /* LV_HAVE_AVX2 */
371
372
373 #ifdef LV_HAVE_AVX2
374 #include <immintrin.h>
375
376 2 static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
377 const lv_16sc_t* in_a,
378 const lv_16sc_t* in_b,
379 unsigned int num_points)
380 {
381 2 lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
382
383 2 const unsigned int avx_iters = num_points / 8;
384
385 2 const lv_16sc_t* _in_a = in_a;
386 2 const lv_16sc_t* _in_b = in_b;
387 2 lv_16sc_t* _out = out;
388 unsigned int number;
389
390
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (avx_iters > 0) {
391 __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
392 realcacc, imagcacc, result;
393 __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
394
395 2 realcacc = _mm256_setzero_si256();
396 2 imagcacc = _mm256_setzero_si256();
397
398 2 mask_imag = _mm256_set_epi8(0xFF,
399 0xFF,
400 0,
401 0,
402 0xFF,
403 0xFF,
404 0,
405 0,
406 0xFF,
407 0xFF,
408 0,
409 0,
410 0xFF,
411 0xFF,
412 0,
413 0,
414 0xFF,
415 0xFF,
416 0,
417 0,
418 0xFF,
419 0xFF,
420 0,
421 0,
422 0xFF,
423 0xFF,
424 0,
425 0,
426 0xFF,
427 0xFF,
428 0,
429 0);
430 2 mask_real = _mm256_set_epi8(0,
431 0,
432 0xFF,
433 0xFF,
434 0,
435 0,
436 0xFF,
437 0xFF,
438 0,
439 0,
440 0xFF,
441 0xFF,
442 0,
443 0,
444 0xFF,
445 0xFF,
446 0,
447 0,
448 0xFF,
449 0xFF,
450 0,
451 0,
452 0xFF,
453 0xFF,
454 0,
455 0,
456 0xFF,
457 0xFF,
458 0,
459 0,
460 0xFF,
461 0xFF);
462
463
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < avx_iters; number++) {
464 32766 a = _mm256_load_si256((__m256i*)_in_a);
465 32766 __VOLK_PREFETCH(_in_a + 16);
466 32766 b = _mm256_load_si256((__m256i*)_in_b);
467 32766 __VOLK_PREFETCH(_in_b + 16);
468 32766 c = _mm256_mullo_epi16(a, b);
469
470 32766 c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane right by 2
471 // bytes, sliding each a.i*b.i into the real lane
472 32766 real = _mm256_subs_epi16(c, c_sr);
473
474 32766 b_sl = _mm256_slli_si256(b, 2);
475 32766 a_sl = _mm256_slli_si256(a, 2);
476
477 32766 imag1 = _mm256_mullo_epi16(a, b_sl);
478 32766 imag2 = _mm256_mullo_epi16(b, a_sl);
479
480 32766 imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
481
482 32766 realcacc = _mm256_adds_epi16(realcacc, real);
483 32766 imagcacc = _mm256_adds_epi16(imagcacc, imag);
484
485 32766 _in_a += 8;
486 32766 _in_b += 8;
487 }
488
489 2 realcacc = _mm256_and_si256(realcacc, mask_real);
490 2 imagcacc = _mm256_and_si256(imagcacc, mask_imag);
491
492 2 result = _mm256_or_si256(realcacc, imagcacc);
493
494 _mm256_store_si256((__m256i*)dotProductVector,
495 result); // Store the results back into the dot product vector
496
497
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
18 for (number = 0; number < 8; ++number) {
498 16 dotProduct = lv_cmake(
499 sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
500 sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
501 }
502 }
503
504
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = 0; number < (num_points % 8); ++number) {
505 14 lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
506 14 dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
507 sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
508 }
509
510 2 *_out = dotProduct;
511 2 }
512 #endif /* LV_HAVE_AVX2 */
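
A common way to sanity-check the SIMD protokernels above is to compare them against
the generic one through the generated _manual dispatcher; implementation names
follow the protokernel suffixes (a sketch, assuming "u_avx2" was compiled in):

    #include <volk/volk.h>

    static int check_u_avx2(const lv_16sc_t* a, const lv_16sc_t* b,
                            unsigned int n)
    {
        lv_16sc_t ref, opt;
        volk_16ic_x2_dot_prod_16ic_manual(&ref, a, b, n, "generic");
        volk_16ic_x2_dot_prod_16ic_manual(&opt, a, b, n, "u_avx2");
        return (lv_creal(ref) == lv_creal(opt)) &&
               (lv_cimag(ref) == lv_cimag(opt));
    }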
513
514
515 #ifdef LV_HAVE_NEON
516 #include <arm_neon.h>
517
518 static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
519 const lv_16sc_t* in_a,
520 const lv_16sc_t* in_b,
521 unsigned int num_points)
522 {
523 unsigned int quarter_points = num_points / 4;
524 unsigned int number;
525
526 lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
527 lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
528 *out = lv_cmake((int16_t)0, (int16_t)0);
529
530 if (quarter_points > 0) {
531 // after vld2 deinterleaving, val[0] holds the real parts
532 // and val[1] holds the imaginary parts
533 int16x4x2_t a_val, b_val, c_val, accumulator;
534 int16x4x2_t tmp_real, tmp_imag;
535 __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
536 accumulator.val[0] = vdup_n_s16(0);
537 accumulator.val[1] = vdup_n_s16(0);
538 lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
539
540 for (number = 0; number < quarter_points; ++number) {
541 a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
542 b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
543 __VOLK_PREFETCH(a_ptr + 8);
544 __VOLK_PREFETCH(b_ptr + 8);
545
546 // multiply real*real and imag*imag to get the real part
547 // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
548 tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
549 // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
550 tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
551
552 // multiply the cross terms to get the imaginary part
553 // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
554 tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
555 // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
556 tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
557
558 c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
559 c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
560
561 accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
562 accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
563
564 a_ptr += 4;
565 b_ptr += 4;
566 }
567
568 vst2_s16((int16_t*)accum_result, accumulator);
569 for (number = 0; number < 4; ++number) {
570 dotProduct = lv_cmake(
571 sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
572 sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
573 }
574
575 *out = dotProduct;
576 }
577
578 // tail case: plain complex multiply-add (wraps rather than saturates)
579 for (number = quarter_points * 4; number < num_points; ++number) {
580 *out += (*a_ptr++) * (*b_ptr++);
581 }
582 }
583
584 #endif /* LV_HAVE_NEON */
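
vld2_s16 deinterleaves four complex points into a real-lane vector and an imag-lane
vector, so the complex product reduces to lane-wise multiplies plus a saturating
subtract/add. A scalar model of one such step (illustrative; the (int16_t) casts
mirror vmul_s16 keeping the low 16 bits of each product):

    #include <stdint.h>

    static void neon_step_model(const int16_t ar[4], const int16_t ai[4],
                                const int16_t br[4], const int16_t bi[4],
                                int16_t re[4], int16_t im[4])
    {
        int k;
        for (k = 0; k < 4; k++) {
            int32_t r = (int32_t)(int16_t)(ar[k] * br[k]) -
                        (int32_t)(int16_t)(ai[k] * bi[k]);
            int32_t i = (int32_t)(int16_t)(ar[k] * bi[k]) +
                        (int32_t)(int16_t)(ai[k] * br[k]);
            /* vqsub_s16/vqadd_s16 clamp to the int16_t range */
            re[k] = (int16_t)(r > 32767 ? 32767 : (r < -32768 ? -32768 : r));
            im[k] = (int16_t)(i > 32767 ? 32767 : (i < -32768 ? -32768 : i));
        }
    }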
585
586
587 #ifdef LV_HAVE_NEON
588 #include <arm_neon.h>
589
590 static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
591 const lv_16sc_t* in_a,
592 const lv_16sc_t* in_b,
593 unsigned int num_points)
594 {
595 unsigned int quarter_points = num_points / 4;
596 unsigned int number;
597
598 lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
599 lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
600 // after vld2 deinterleaving, val[0] holds the real parts
601 // and val[1] holds the imaginary parts
602 int16x4x2_t a_val, b_val, accumulator;
603 int16x4x2_t tmp;
604 __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
605 accumulator.val[0] = vdup_n_s16(0);
606 accumulator.val[1] = vdup_n_s16(0);
607
608 for (number = 0; number < quarter_points; ++number) {
609 a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
610 b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
611 __VOLK_PREFETCH(a_ptr + 8);
612 __VOLK_PREFETCH(b_ptr + 8);
613
614 tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
615 tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
616
617 // use multiply accumulate/subtract to get result
618 tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
619 tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
620
621 accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
622 accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
623
624 a_ptr += 4;
625 b_ptr += 4;
626 }
627
628 vst2_s16((int16_t*)accum_result, accumulator);
629 *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
630
631 // tail case: plain complex multiply-add (wraps rather than saturates)
632 for (number = quarter_points * 4; number < num_points; ++number) {
633 *out += (*a_ptr++) * (*b_ptr++);
634 }
635 }
636
637 #endif /* LV_HAVE_NEON */
638
639
640 #ifdef LV_HAVE_NEON
641 #include <arm_neon.h>
642
643 static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
644 const lv_16sc_t* in_a,
645 const lv_16sc_t* in_b,
646 unsigned int num_points)
647 {
648 unsigned int quarter_points = num_points / 4;
649 unsigned int number;
650
651 lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
652 lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
653 // after vld2 deinterleaving, val[0] holds the real parts
654 // and val[1] holds the imaginary parts
655 int16x4x2_t a_val, b_val, accumulator1, accumulator2;
656
657 __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
658 accumulator1.val[0] = vdup_n_s16(0);
659 accumulator1.val[1] = vdup_n_s16(0);
660 accumulator2.val[0] = vdup_n_s16(0);
661 accumulator2.val[1] = vdup_n_s16(0);
662
663 for (number = 0; number < quarter_points; ++number) {
664 a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
665 b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
666 __VOLK_PREFETCH(a_ptr + 8);
667 __VOLK_PREFETCH(b_ptr + 8);
668
669 // use 2 accumulators to remove inter-instruction data dependencies
670 accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
671 accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
672 accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
673 accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
674
675 a_ptr += 4;
676 b_ptr += 4;
677 }
678
679 accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
680 accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
681
682 vst2_s16((int16_t*)accum_result, accumulator1);
683 *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
684
685 // tail case: plain complex multiply-add (wraps rather than saturates)
686 for (number = quarter_points * 4; number < num_points; ++number) {
687 *out += (*a_ptr++) * (*b_ptr++);
688 }
689 }
690
691 #endif /* LV_HAVE_NEON */
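
The two-accumulator layout in the optvma kernel exists purely to break the
dependency chain between consecutive multiply-accumulates. The same idea in scalar
form (a generic illustration, not VOLK code):

    #include <stdint.h>

    static int32_t sum_two_chains(const int16_t* x, unsigned int n)
    {
        int32_t acc1 = 0, acc2 = 0;
        unsigned int k;
        for (k = 0; k + 1 < n; k += 2) {
            acc1 += x[k];     /* chain 1 */
            acc2 += x[k + 1]; /* chain 2: independent of acc1, so the two
                                 additions can execute in parallel */
        }
        if (k < n)
            acc1 += x[k]; /* odd-length remainder */
        return acc1 + acc2;
    }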
692
693 #endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/
694