GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16i_32fc_dot_prod_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 314 314 100.0%
Functions: 7 7 100.0%
Branches: 28 28 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16i_32fc_dot_prod_32fc
12 *
13 * \b Overview
14 *
15 * This block computes the dot product (or inner product) between two
16 * vectors, the \p input and \p taps vectors. Given a set of \p
17 * num_points taps, the result is the sum of products between the two
18 * vectors. The result is a single value stored in the \p result
19 * address and will be complex.
20 *
21 * <b>Dispatcher Prototype</b>
22 * \code
23 * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t
24 * * taps, unsigned int num_points) \endcode
25 *
26 * \b Inputs
27 * \li input: vector of shorts.
28 * \li taps: complex taps.
29 * \li num_points: number of samples in both \p input and \p taps.
30 *
31 * \b Outputs
32 * \li result: pointer to a complex value to hold the dot product result.
33 *
34 * \b Example
35 * \code
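36 * int N = 10000;
37 *
38 * // input (N shorts) and taps (N lv_32fc_t values) must be allocated and filled
39 * lv_32fc_t result;
40 * volk_16i_32fc_dot_prod_32fc(&result, input, taps, N);
41 *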
42 * \endcode
43 */
44
45 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
47
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50
51
52 #ifdef LV_HAVE_GENERIC
53
/*
 * Portable C implementation of the dot product.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only).
 * taps:       num_points complex taps (read only).
 * num_points: number of elements read from both input and taps.
 *
 * The main loop is unrolled by N_UNROLL (4) with one accumulator per slot;
 * the remaining num_points % 4 elements are folded into acc0 afterwards.
 * With num_points == 0 neither loop runs and *result is set to 0.
 */
54 2 static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
55 const short* input,
56 const lv_32fc_t* taps,
57 unsigned int num_points)
58 {
59
60 static const int N_UNROLL = 4;
61
62 2 lv_32fc_t acc0 = 0;
63 2 lv_32fc_t acc1 = 0;
64 2 lv_32fc_t acc2 = 0;
65 2 lv_32fc_t acc3 = 0;
66
67 2 unsigned i = 0;
// n is num_points rounded down to a multiple of N_UNROLL
68 2 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
69
70
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (i = 0; i < n; i += N_UNROLL) {
71 65534 acc0 += taps[i + 0] * (float)input[i + 0];
72 65534 acc1 += taps[i + 1] * (float)input[i + 1];
73 65534 acc2 += taps[i + 2] * (float)input[i + 2];
74 65534 acc3 += taps[i + 3] * (float)input[i + 3];
75 }
76
77
// tail: i continues from n, so this handles the last num_points % 4 elements
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; i < num_points; i++) {
78 6 acc0 += taps[i] * (float)input[i];
79 }
80
81 2 *result = acc0 + acc1 + acc2 + acc3;
82 2 }
83
84 #endif /*LV_HAVE_GENERIC*/
85
86 #ifdef LV_HAVE_NEON
87 #include <arm_neon.h>
/*
 * ARM NEON implementation of the dot product.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only).
 * taps:       num_points complex taps (read only).
 * num_points: number of elements read from both input and taps.
 *
 * Four points are processed per vector iteration; real and imaginary parts
 * are accumulated in separate float32x4 registers. The remaining
 * num_points % 4 points are handled by a scalar loop.
 */
88 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
89 const short* input,
90 const lv_32fc_t* taps,
91 unsigned int num_points)
92 {
93
94 unsigned ii;
95 unsigned quarter_points = num_points / 4;
// const is cast away here, but both pointers are only read from below
96 lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
97 short* inputPtr = (short*)input;
98 lv_32fc_t accumulator_vec[4];
99
100 float32x4x2_t tapsVal, accumulator_val;
101 int16x4_t input16;
102 int32x4_t input32;
103 float32x4_t input_float, prod_re, prod_im;
104
105 accumulator_val.val[0] = vdupq_n_f32(0.0);
106 accumulator_val.val[1] = vdupq_n_f32(0.0);
107
108 for (ii = 0; ii < quarter_points; ++ii) {
// de-interleaving load: val[0] gets 4 real parts, val[1] gets 4 imaginary parts
109 tapsVal = vld2q_f32((float*)tapsPtr);
110 input16 = vld1_s16(inputPtr);
111 // widen 16-bit int to 32-bit int
112 input32 = vmovl_s16(input16);
113 // convert 32-bit int to float (plain conversion, no scaling)
114 input_float = vcvtq_f32_s32(input32);
115
116 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
117 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
118
119 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
120 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
121
122 tapsPtr += 4;
123 inputPtr += 4;
124 }
// interleaving store: writes the partial sums back as 4 complex (re, im) values
125 vst2q_f32((float*)accumulator_vec, accumulator_val);
// reduce the 4 complex partial sums into accumulator_vec[0]
126 accumulator_vec[0] += accumulator_vec[1];
127 accumulator_vec[2] += accumulator_vec[3];
128 accumulator_vec[0] += accumulator_vec[2];
129
// scalar tail; tapsPtr/inputPtr already point just past the vectorized part
130 for (ii = quarter_points * 4; ii < num_points; ++ii) {
131 accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
132 }
133
134 *result = accumulator_vec[0];
135 }
136
137 #endif /*LV_HAVE_NEON*/
138
139 #if LV_HAVE_SSE && LV_HAVE_MMX
140
/*
 * SSE + MMX implementation of the dot product, unaligned variant.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only, read element by element).
 * taps:       num_points complex taps (read only, loaded with _mm_loadu_ps,
 *             so no 16-byte alignment is required).
 * num_points: number of elements read from both input and taps.
 *
 * Eight points are processed per vector iteration using four independent
 * accumulators; the remaining num_points % 8 points use a scalar loop.
 */
141 2 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
142 const short* input,
143 const lv_32fc_t* taps,
144 unsigned int num_points)
145 {
146
147 2 unsigned int number = 0;
148 2 const unsigned int eighthPoints = num_points / 8;
149
150 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
151 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
152 2 const float* bPtr = (float*)taps;
153
154 __m64 m0, m1;
155 __m128 f0, f1, f2, f3;
156 __m128 a0Val, a1Val, a2Val, a3Val;
157 __m128 b0Val, b1Val, b2Val, b3Val;
158 __m128 c0Val, c1Val, c2Val, c3Val;
159
160 2 __m128 dotProdVal0 = _mm_setzero_ps();
161 2 __m128 dotProdVal1 = _mm_setzero_ps();
162 2 __m128 dotProdVal2 = _mm_setzero_ps();
163 2 __m128 dotProdVal3 = _mm_setzero_ps();
164
165
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
166
// pack samples 0..3 and 4..7 into two MMX registers (lowest element last)
167 32766 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
168 65532 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
// f0 == f1 and f2 == f3: each is the 4 samples converted to float
169 32766 f0 = _mm_cvtpi16_ps(m0);
170 32766 f1 = _mm_cvtpi16_ps(m0);
171 32766 f2 = _mm_cvtpi16_ps(m1);
172 32766 f3 = _mm_cvtpi16_ps(m1);
173
// interleaving a vector with its copy duplicates every sample
// (s0, s0, s1, s1, ...) so it lines up with the (re, im) tap layout
174 32766 a0Val = _mm_unpacklo_ps(f0, f1);
175 32766 a1Val = _mm_unpackhi_ps(f0, f1);
176 32766 a2Val = _mm_unpacklo_ps(f2, f3);
177 32766 a3Val = _mm_unpackhi_ps(f2, f3);
178
// each load covers 4 floats, i.e. 2 complex taps
179 32766 b0Val = _mm_loadu_ps(bPtr);
180 32766 b1Val = _mm_loadu_ps(bPtr + 4);
181 32766 b2Val = _mm_loadu_ps(bPtr + 8);
182 65532 b3Val = _mm_loadu_ps(bPtr + 12);
183
184 32766 c0Val = _mm_mul_ps(a0Val, b0Val);
185 32766 c1Val = _mm_mul_ps(a1Val, b1Val);
186 32766 c2Val = _mm_mul_ps(a2Val, b2Val);
187 32766 c3Val = _mm_mul_ps(a3Val, b3Val);
188
189 32766 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
190 32766 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
191 32766 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
192 32766 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
193
194 32766 aPtr += 8;
195 32766 bPtr += 16;
196 }
197
198 _mm_empty(); // clear the mmx technology state
199
200 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
201 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
202 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
203
204 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
205
206 _mm_store_ps(dotProductVector,
207 dotProdVal0); // Store the results back into the dot product vector
208
// dotProductVector holds two (re, im) partial sums; add both to the result
209 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
210 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
211
212 2 number = eighthPoints * 8;
213
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
214 14 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
215 14 aPtr += 1;
216 14 bPtr += 2;
217 }
218
219 2 *result = returnValue;
220 2 }
221
222 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
223
224
225 #if LV_HAVE_AVX2 && LV_HAVE_FMA
226
/*
 * AVX2 + FMA implementation of the dot product, unaligned variant.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only, loaded with
 *             _mm_loadu_si128, so no alignment is required).
 * taps:       num_points complex taps (read only, loaded with
 *             _mm256_loadu_ps, so no alignment is required).
 * num_points: number of elements read from both input and taps.
 *
 * Sixteen points are processed per vector iteration using four independent
 * accumulators updated with fused multiply-add; the remaining
 * num_points % 16 points use a scalar loop.
 */
227 2 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
228 const short* input,
229 const lv_32fc_t* taps,
230 unsigned int num_points)
231 {
232
233 2 unsigned int number = 0;
234 2 const unsigned int sixteenthPoints = num_points / 16;
235
236 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
237 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
238 2 const float* bPtr = (float*)taps;
239
240 __m128i m0, m1;
241 __m256i f0, f1;
242 __m256 g0, g1, h0, h1, h2, h3;
243 __m256 a0Val, a1Val, a2Val, a3Val;
244 __m256 b0Val, b1Val, b2Val, b3Val;
245
246 2 __m256 dotProdVal0 = _mm256_setzero_ps();
247 2 __m256 dotProdVal1 = _mm256_setzero_ps();
248 2 __m256 dotProdVal2 = _mm256_setzero_ps();
249 2 __m256 dotProdVal3 = _mm256_setzero_ps();
250
251
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
252
// two loads of 8 shorts each
253 16382 m0 = _mm_loadu_si128((__m128i const*)aPtr);
254 32764 m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
255
// sign-extend 16-bit to 32-bit integers, then convert to float
256 16382 f0 = _mm256_cvtepi16_epi32(m0);
257 16382 g0 = _mm256_cvtepi32_ps(f0);
258 16382 f1 = _mm256_cvtepi16_epi32(m1);
259 16382 g1 = _mm256_cvtepi32_ps(f1);
260
// unpacking a vector with itself duplicates each sample, but only within
// each 128-bit lane, so the sample order is still lane-scrambled here
261 16382 h0 = _mm256_unpacklo_ps(g0, g0);
262 16382 h1 = _mm256_unpackhi_ps(g0, g0);
263 16382 h2 = _mm256_unpacklo_ps(g1, g1);
264 16382 h3 = _mm256_unpackhi_ps(g1, g1);
265
// 0x20 joins the two low lanes, 0x31 the two high lanes; this restores
// sequential order: a0Val = s0 s0 s1 s1 s2 s2 s3 s3, a1Val = s4 s4 .. s7 s7
266 16382 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
267 16382 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
268 16382 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
269 16382 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
270
// each load covers 8 floats, i.e. 4 complex taps
271 16382 b0Val = _mm256_loadu_ps(bPtr);
272 16382 b1Val = _mm256_loadu_ps(bPtr + 8);
273 16382 b2Val = _mm256_loadu_ps(bPtr + 16);
274 32764 b3Val = _mm256_loadu_ps(bPtr + 24);
275
// fused multiply-add: dotProdValN = aNVal * bNVal + dotProdValN
276 16382 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
277 16382 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
278 16382 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
279 16382 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
280
281 16382 aPtr += 16;
282 16382 bPtr += 32;
283 }
284
285 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
286 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
287 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
288
289 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
290
291 _mm256_store_ps(dotProductVector,
292 dotProdVal0); // Store the results back into the dot product vector
293
// dotProductVector holds four (re, im) partial sums; add all to the result
294 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
295 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
296 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
297 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
298
299 2 number = sixteenthPoints * 16;
300
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
301 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
302 30 aPtr += 1;
303 30 bPtr += 2;
304 }
305
306 2 *result = returnValue;
307 2 }
308
309 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
310
311
312 #ifdef LV_HAVE_AVX2
313
/*
 * AVX2 implementation of the dot product, unaligned variant (no FMA).
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only, loaded with
 *             _mm_loadu_si128, so no alignment is required).
 * taps:       num_points complex taps (read only, loaded with
 *             _mm256_loadu_ps, so no alignment is required).
 * num_points: number of elements read from both input and taps.
 *
 * Sixteen points are processed per vector iteration using four independent
 * accumulators (separate multiply and add); the remaining num_points % 16
 * points use a scalar loop.
 */
314 2 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
315 const short* input,
316 const lv_32fc_t* taps,
317 unsigned int num_points)
318 {
319
320 2 unsigned int number = 0;
321 2 const unsigned int sixteenthPoints = num_points / 16;
322
323 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
324 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
325 2 const float* bPtr = (float*)taps;
326
327 __m128i m0, m1;
328 __m256i f0, f1;
329 __m256 g0, g1, h0, h1, h2, h3;
330 __m256 a0Val, a1Val, a2Val, a3Val;
331 __m256 b0Val, b1Val, b2Val, b3Val;
332 __m256 c0Val, c1Val, c2Val, c3Val;
333
334 2 __m256 dotProdVal0 = _mm256_setzero_ps();
335 2 __m256 dotProdVal1 = _mm256_setzero_ps();
336 2 __m256 dotProdVal2 = _mm256_setzero_ps();
337 2 __m256 dotProdVal3 = _mm256_setzero_ps();
338
339
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
340
// two loads of 8 shorts each
341 16382 m0 = _mm_loadu_si128((__m128i const*)aPtr);
342 32764 m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
343
// sign-extend 16-bit to 32-bit integers, then convert to float
344 16382 f0 = _mm256_cvtepi16_epi32(m0);
345 16382 g0 = _mm256_cvtepi32_ps(f0);
346 16382 f1 = _mm256_cvtepi16_epi32(m1);
347 16382 g1 = _mm256_cvtepi32_ps(f1);
348
// unpacking a vector with itself duplicates each sample, but only within
// each 128-bit lane, so the sample order is still lane-scrambled here
349 16382 h0 = _mm256_unpacklo_ps(g0, g0);
350 16382 h1 = _mm256_unpackhi_ps(g0, g0);
351 16382 h2 = _mm256_unpacklo_ps(g1, g1);
352 16382 h3 = _mm256_unpackhi_ps(g1, g1);
353
// 0x20 joins the two low lanes, 0x31 the two high lanes; this restores
// sequential order: a0Val = s0 s0 s1 s1 s2 s2 s3 s3, a1Val = s4 s4 .. s7 s7
354 16382 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
355 16382 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
356 16382 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
357 16382 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
358
// each load covers 8 floats, i.e. 4 complex taps
359 16382 b0Val = _mm256_loadu_ps(bPtr);
360 16382 b1Val = _mm256_loadu_ps(bPtr + 8);
361 16382 b2Val = _mm256_loadu_ps(bPtr + 16);
362 32764 b3Val = _mm256_loadu_ps(bPtr + 24);
363
364 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
365 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
366 16382 c2Val = _mm256_mul_ps(a2Val, b2Val);
367 16382 c3Val = _mm256_mul_ps(a3Val, b3Val);
368
369 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
370 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
371 16382 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
372 16382 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
373
374 16382 aPtr += 16;
375 16382 bPtr += 32;
376 }
377
378 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
379 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
380 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
381
382 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
383
384 _mm256_store_ps(dotProductVector,
385 dotProdVal0); // Store the results back into the dot product vector
386
// dotProductVector holds four (re, im) partial sums; add all to the result
387 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
388 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
389 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
390 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
391
392 2 number = sixteenthPoints * 16;
393
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
394 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
395 30 aPtr += 1;
396 30 bPtr += 2;
397 }
398
399 2 *result = returnValue;
400 2 }
401
402 #endif /*LV_HAVE_AVX2*/
403
404
405 #if LV_HAVE_SSE && LV_HAVE_MMX
406
407
/*
 * SSE + MMX implementation of the dot product, aligned variant.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only, read element by element).
 * taps:       num_points complex taps (read only). Loaded with _mm_load_ps,
 *             so taps must be 16-byte aligned.
 * num_points: number of elements read from both input and taps.
 *
 * Eight points are processed per vector iteration using four independent
 * accumulators; the remaining num_points % 8 points use a scalar loop.
 */
408 2 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
409 const short* input,
410 const lv_32fc_t* taps,
411 unsigned int num_points)
412 {
413
414 2 unsigned int number = 0;
415 2 const unsigned int eighthPoints = num_points / 8;
416
417 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
418 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
419 2 const float* bPtr = (float*)taps;
420
421 __m64 m0, m1;
422 __m128 f0, f1, f2, f3;
423 __m128 a0Val, a1Val, a2Val, a3Val;
424 __m128 b0Val, b1Val, b2Val, b3Val;
425 __m128 c0Val, c1Val, c2Val, c3Val;
426
427 2 __m128 dotProdVal0 = _mm_setzero_ps();
428 2 __m128 dotProdVal1 = _mm_setzero_ps();
429 2 __m128 dotProdVal2 = _mm_setzero_ps();
430 2 __m128 dotProdVal3 = _mm_setzero_ps();
431
432
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
433
// pack samples 0..3 and 4..7 into two MMX registers (lowest element last)
434 32766 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
435 65532 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
// f0 == f1 and f2 == f3: each is the 4 samples converted to float
436 32766 f0 = _mm_cvtpi16_ps(m0);
437 32766 f1 = _mm_cvtpi16_ps(m0);
438 32766 f2 = _mm_cvtpi16_ps(m1);
439 32766 f3 = _mm_cvtpi16_ps(m1);
440
// interleaving a vector with its copy duplicates every sample
// (s0, s0, s1, s1, ...) so it lines up with the (re, im) tap layout
441 32766 a0Val = _mm_unpacklo_ps(f0, f1);
442 32766 a1Val = _mm_unpackhi_ps(f0, f1);
443 32766 a2Val = _mm_unpacklo_ps(f2, f3);
444 32766 a3Val = _mm_unpackhi_ps(f2, f3);
445
// aligned loads; each covers 4 floats, i.e. 2 complex taps
446 32766 b0Val = _mm_load_ps(bPtr);
447 32766 b1Val = _mm_load_ps(bPtr + 4);
448 32766 b2Val = _mm_load_ps(bPtr + 8);
449 65532 b3Val = _mm_load_ps(bPtr + 12);
450
451 32766 c0Val = _mm_mul_ps(a0Val, b0Val);
452 32766 c1Val = _mm_mul_ps(a1Val, b1Val);
453 32766 c2Val = _mm_mul_ps(a2Val, b2Val);
454 32766 c3Val = _mm_mul_ps(a3Val, b3Val);
455
456 32766 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
457 32766 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
458 32766 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
459 32766 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
460
461 32766 aPtr += 8;
462 32766 bPtr += 16;
463 }
464
465 _mm_empty(); // clear the mmx technology state
466
467 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
468 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
469 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
470
471 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
472
473 _mm_store_ps(dotProductVector,
474 dotProdVal0); // Store the results back into the dot product vector
475
// dotProductVector holds two (re, im) partial sums; add both to the result
476 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
477 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
478
479 2 number = eighthPoints * 8;
480
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
481 14 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
482 14 aPtr += 1;
483 14 bPtr += 2;
484 }
485
486 2 *result = returnValue;
487 2 }
488
489 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
490
491 #ifdef LV_HAVE_AVX2
492
/*
 * AVX2 implementation of the dot product, aligned variant (no FMA).
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only). Loaded with
 *             _mm_load_si128, so input must be 16-byte aligned.
 * taps:       num_points complex taps (read only). Loaded with
 *             _mm256_load_ps, so taps must be 32-byte aligned.
 * num_points: number of elements read from both input and taps.
 *
 * Sixteen points are processed per vector iteration using four independent
 * accumulators (separate multiply and add); the remaining num_points % 16
 * points use a scalar loop.
 */
493 2 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
494 const short* input,
495 const lv_32fc_t* taps,
496 unsigned int num_points)
497 {
498
499 2 unsigned int number = 0;
500 2 const unsigned int sixteenthPoints = num_points / 16;
501
502 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
503 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
504 2 const float* bPtr = (float*)taps;
505
506 __m128i m0, m1;
507 __m256i f0, f1;
508 __m256 g0, g1, h0, h1, h2, h3;
509 __m256 a0Val, a1Val, a2Val, a3Val;
510 __m256 b0Val, b1Val, b2Val, b3Val;
511 __m256 c0Val, c1Val, c2Val, c3Val;
512
513 2 __m256 dotProdVal0 = _mm256_setzero_ps();
514 2 __m256 dotProdVal1 = _mm256_setzero_ps();
515 2 __m256 dotProdVal2 = _mm256_setzero_ps();
516 2 __m256 dotProdVal3 = _mm256_setzero_ps();
517
518
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
519
// two aligned loads of 8 shorts each
520 16382 m0 = _mm_load_si128((__m128i const*)aPtr);
521 32764 m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
522
// sign-extend 16-bit to 32-bit integers, then convert to float
523 16382 f0 = _mm256_cvtepi16_epi32(m0);
524 16382 g0 = _mm256_cvtepi32_ps(f0);
525 16382 f1 = _mm256_cvtepi16_epi32(m1);
526 16382 g1 = _mm256_cvtepi32_ps(f1);
527
// unpacking a vector with itself duplicates each sample, but only within
// each 128-bit lane, so the sample order is still lane-scrambled here
528 16382 h0 = _mm256_unpacklo_ps(g0, g0);
529 16382 h1 = _mm256_unpackhi_ps(g0, g0);
530 16382 h2 = _mm256_unpacklo_ps(g1, g1);
531 16382 h3 = _mm256_unpackhi_ps(g1, g1);
532
// 0x20 joins the two low lanes, 0x31 the two high lanes; this restores
// sequential order: a0Val = s0 s0 s1 s1 s2 s2 s3 s3, a1Val = s4 s4 .. s7 s7
533 16382 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
534 16382 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
535 16382 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
536 16382 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
537
// aligned loads; each covers 8 floats, i.e. 4 complex taps
538 16382 b0Val = _mm256_load_ps(bPtr);
539 16382 b1Val = _mm256_load_ps(bPtr + 8);
540 16382 b2Val = _mm256_load_ps(bPtr + 16);
541 32764 b3Val = _mm256_load_ps(bPtr + 24);
542
543 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
544 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
545 16382 c2Val = _mm256_mul_ps(a2Val, b2Val);
546 16382 c3Val = _mm256_mul_ps(a3Val, b3Val);
547
548 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
549 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
550 16382 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
551 16382 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
552
553 16382 aPtr += 16;
554 16382 bPtr += 32;
555 }
556
557 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
558 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
559 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
560
561 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
562
563 _mm256_store_ps(dotProductVector,
564 dotProdVal0); // Store the results back into the dot product vector
565
// dotProductVector holds four (re, im) partial sums; add all to the result
566 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
567 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
568 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
569 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
570
571 2 number = sixteenthPoints * 16;
572
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
573 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
574 30 aPtr += 1;
575 30 bPtr += 2;
576 }
577
578 2 *result = returnValue;
579 2 }
580
581
582 #endif /*LV_HAVE_AVX2*/
583
584 #if LV_HAVE_AVX2 && LV_HAVE_FMA
585
/*
 * AVX2 + FMA implementation of the dot product, aligned variant.
 *
 * Computes the sum over i in [0, num_points) of taps[i] * (float)input[i]
 * and stores the complex sum in *result.
 *
 * result:     output, receives the accumulated complex sum.
 * input:      num_points 16-bit samples (read only). Loaded with
 *             _mm_load_si128, so input must be 16-byte aligned.
 * taps:       num_points complex taps (read only). Loaded with
 *             _mm256_load_ps, so taps must be 32-byte aligned.
 * num_points: number of elements read from both input and taps.
 *
 * Sixteen points are processed per vector iteration using four independent
 * accumulators updated with fused multiply-add; the remaining
 * num_points % 16 points use a scalar loop.
 */
586 2 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
587 const short* input,
588 const lv_32fc_t* taps,
589 unsigned int num_points)
590 {
591
592 2 unsigned int number = 0;
593 2 const unsigned int sixteenthPoints = num_points / 16;
594
595 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
596 2 const short* aPtr = input;
// taps are walked as a flat float array: re0, im0, re1, im1, ...
597 2 const float* bPtr = (float*)taps;
598
599 __m128i m0, m1;
600 __m256i f0, f1;
601 __m256 g0, g1, h0, h1, h2, h3;
602 __m256 a0Val, a1Val, a2Val, a3Val;
603 __m256 b0Val, b1Val, b2Val, b3Val;
604
605 2 __m256 dotProdVal0 = _mm256_setzero_ps();
606 2 __m256 dotProdVal1 = _mm256_setzero_ps();
607 2 __m256 dotProdVal2 = _mm256_setzero_ps();
608 2 __m256 dotProdVal3 = _mm256_setzero_ps();
609
610
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
611
// two aligned loads of 8 shorts each
612 16382 m0 = _mm_load_si128((__m128i const*)aPtr);
613 32764 m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
614
// sign-extend 16-bit to 32-bit integers, then convert to float
615 16382 f0 = _mm256_cvtepi16_epi32(m0);
616 16382 g0 = _mm256_cvtepi32_ps(f0);
617 16382 f1 = _mm256_cvtepi16_epi32(m1);
618 16382 g1 = _mm256_cvtepi32_ps(f1);
619
// unpacking a vector with itself duplicates each sample, but only within
// each 128-bit lane, so the sample order is still lane-scrambled here
620 16382 h0 = _mm256_unpacklo_ps(g0, g0);
621 16382 h1 = _mm256_unpackhi_ps(g0, g0);
622 16382 h2 = _mm256_unpacklo_ps(g1, g1);
623 16382 h3 = _mm256_unpackhi_ps(g1, g1);
624
// 0x20 joins the two low lanes, 0x31 the two high lanes; this restores
// sequential order: a0Val = s0 s0 s1 s1 s2 s2 s3 s3, a1Val = s4 s4 .. s7 s7
625 16382 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
626 16382 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
627 16382 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
628 16382 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
629
// aligned loads; each covers 8 floats, i.e. 4 complex taps
630 16382 b0Val = _mm256_load_ps(bPtr);
631 16382 b1Val = _mm256_load_ps(bPtr + 8);
632 16382 b2Val = _mm256_load_ps(bPtr + 16);
633 32764 b3Val = _mm256_load_ps(bPtr + 24);
634
// fused multiply-add: dotProdValN = aNVal * bNVal + dotProdValN
635 16382 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
636 16382 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
637 16382 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
638 16382 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
639
640 16382 aPtr += 16;
641 16382 bPtr += 32;
642 }
643
644 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
645 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
646 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
647
648 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
649
650 _mm256_store_ps(dotProductVector,
651 dotProdVal0); // Store the results back into the dot product vector
652
// dotProductVector holds four (re, im) partial sums; add all to the result
653 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
654 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
655 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
656 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
657
658 2 number = sixteenthPoints * 16;
659
// scalar tail; aPtr/bPtr already point just past the vectorized part
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
660 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
661 30 aPtr += 1;
662 30 bPtr += 2;
663 }
664
665 2 *result = returnValue;
666 2 }
667
668
669 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
670
671
672 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
673