GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_dot_prod_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 366 408 89.7%
Functions: 12 14 85.7%
Branches: 44 52 84.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_dot_prod_32f
12 *
13 * \b Overview
14 *
15 * This block computes the dot product (or inner product) between two
16 * vectors, the \p input and \p taps vectors. Given \p num_points
17 * samples in each vector, the result is the sum of the element-wise
18 * products. That single float value is written to the address given
19 * by \p result; the function itself returns void.
20 *
21 * <b>Dispatcher Prototype</b>
22 * \code
23 * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps,
24 * unsigned int num_points) \endcode
25 *
26 * \b Inputs
27 * \li input: vector of floats.
28 * \li taps: vector of float taps.
29 * \li num_points: number of samples in both \p input and \p taps.
30 *
31 * \b Outputs
32 * \li result: pointer to a float value to hold the dot product result.
33 *
34 * \b Example
35 * Take the dot product of an increasing vector and a vector of ones. The result is the sum of the integers 0 through 9, i.e. 45.
36 * \code int N = 10; unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
40 *
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * increasing[ii] = (float)ii;
43 * ones[ii] = 1.f;
44 * }
45 *
46 * volk_32f_x2_dot_prod_32f(out, increasing, ones, N);
47 *
48 * printf("out = %1.2f\n", *out);
49 *
50 * volk_free(increasing);
51 * volk_free(ones);
52 * volk_free(out);
53 *
54 * return 0;
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
60
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63
64
65 #ifdef LV_HAVE_GENERIC
66
67
68 2 static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
69 const float* input,
70 const float* taps,
71 unsigned int num_points)
72 {
73
74 2 float dotProduct = 0;
75 2 const float* aPtr = input;
76 2 const float* bPtr = taps;
77 2 unsigned int number = 0;
78
79
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
80 262142 dotProduct += ((*aPtr++) * (*bPtr++));
81 }
82
83 2 *result = dotProduct;
84 2 }
85
86 #endif /*LV_HAVE_GENERIC*/
87
88
89 #ifdef LV_HAVE_SSE
90
91
92 2 static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
93 const float* input,
94 const float* taps,
95 unsigned int num_points)
96 {
97
98 2 unsigned int number = 0;
99 2 const unsigned int sixteenthPoints = num_points / 16;
100
101 2 float dotProduct = 0;
102 2 const float* aPtr = input;
103 2 const float* bPtr = taps;
104
105 __m128 a0Val, a1Val, a2Val, a3Val;
106 __m128 b0Val, b1Val, b2Val, b3Val;
107 __m128 c0Val, c1Val, c2Val, c3Val;
108
109 2 __m128 dotProdVal0 = _mm_setzero_ps();
110 2 __m128 dotProdVal1 = _mm_setzero_ps();
111 2 __m128 dotProdVal2 = _mm_setzero_ps();
112 2 __m128 dotProdVal3 = _mm_setzero_ps();
113
114
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
115
116 16382 a0Val = _mm_loadu_ps(aPtr);
117 16382 a1Val = _mm_loadu_ps(aPtr + 4);
118 16382 a2Val = _mm_loadu_ps(aPtr + 8);
119 32764 a3Val = _mm_loadu_ps(aPtr + 12);
120 16382 b0Val = _mm_loadu_ps(bPtr);
121 16382 b1Val = _mm_loadu_ps(bPtr + 4);
122 16382 b2Val = _mm_loadu_ps(bPtr + 8);
123 32764 b3Val = _mm_loadu_ps(bPtr + 12);
124
125 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
126 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
127 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
128 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
129
130 16382 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131 16382 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132 16382 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133 16382 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
134
135 16382 aPtr += 16;
136 16382 bPtr += 16;
137 }
138
139 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
142
143 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
144
145 _mm_store_ps(dotProductVector,
146 dotProdVal0); // Store the results back into the dot product vector
147
148 2 dotProduct = dotProductVector[0];
149 2 dotProduct += dotProductVector[1];
150 2 dotProduct += dotProductVector[2];
151 2 dotProduct += dotProductVector[3];
152
153 2 number = sixteenthPoints * 16;
154
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
155 30 dotProduct += ((*aPtr++) * (*bPtr++));
156 }
157
158 2 *result = dotProduct;
159 2 }
160
161 #endif /*LV_HAVE_SSE*/
162
163 #ifdef LV_HAVE_SSE3
164
165 #include <pmmintrin.h>
166
167 2 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
168 const float* input,
169 const float* taps,
170 unsigned int num_points)
171 {
172 2 unsigned int number = 0;
173 2 const unsigned int sixteenthPoints = num_points / 16;
174
175 2 float dotProduct = 0;
176 2 const float* aPtr = input;
177 2 const float* bPtr = taps;
178
179 __m128 a0Val, a1Val, a2Val, a3Val;
180 __m128 b0Val, b1Val, b2Val, b3Val;
181 __m128 c0Val, c1Val, c2Val, c3Val;
182
183 2 __m128 dotProdVal0 = _mm_setzero_ps();
184 2 __m128 dotProdVal1 = _mm_setzero_ps();
185 2 __m128 dotProdVal2 = _mm_setzero_ps();
186 2 __m128 dotProdVal3 = _mm_setzero_ps();
187
188
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
189
190 16382 a0Val = _mm_loadu_ps(aPtr);
191 16382 a1Val = _mm_loadu_ps(aPtr + 4);
192 16382 a2Val = _mm_loadu_ps(aPtr + 8);
193 32764 a3Val = _mm_loadu_ps(aPtr + 12);
194 16382 b0Val = _mm_loadu_ps(bPtr);
195 16382 b1Val = _mm_loadu_ps(bPtr + 4);
196 16382 b2Val = _mm_loadu_ps(bPtr + 8);
197 32764 b3Val = _mm_loadu_ps(bPtr + 12);
198
199 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
200 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
201 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
202 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
203
204 16382 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205 16382 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206 16382 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207 16382 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
208
209 16382 aPtr += 16;
210 16382 bPtr += 16;
211 }
212
213 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
216
217 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
218 _mm_store_ps(dotProductVector,
219 dotProdVal0); // Store the results back into the dot product vector
220
221 2 dotProduct = dotProductVector[0];
222 2 dotProduct += dotProductVector[1];
223 2 dotProduct += dotProductVector[2];
224 2 dotProduct += dotProductVector[3];
225
226 2 number = sixteenthPoints * 16;
227
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
228 30 dotProduct += ((*aPtr++) * (*bPtr++));
229 }
230
231 2 *result = dotProduct;
232 2 }
233
234 #endif /*LV_HAVE_SSE3*/
235
236 #ifdef LV_HAVE_SSE4_1
237
238 #include <smmintrin.h>
239
240 2 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
241 const float* input,
242 const float* taps,
243 unsigned int num_points)
244 {
245 2 unsigned int number = 0;
246 2 const unsigned int sixteenthPoints = num_points / 16;
247
248 2 float dotProduct = 0;
249 2 const float* aPtr = input;
250 2 const float* bPtr = taps;
251
252 __m128 aVal1, bVal1, cVal1;
253 __m128 aVal2, bVal2, cVal2;
254 __m128 aVal3, bVal3, cVal3;
255 __m128 aVal4, bVal4, cVal4;
256
257 2 __m128 dotProdVal = _mm_setzero_ps();
258
259
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
260
261 16382 aVal1 = _mm_loadu_ps(aPtr);
262 16382 aPtr += 4;
263 16382 aVal2 = _mm_loadu_ps(aPtr);
264 16382 aPtr += 4;
265 16382 aVal3 = _mm_loadu_ps(aPtr);
266 16382 aPtr += 4;
267 16382 aVal4 = _mm_loadu_ps(aPtr);
268 16382 aPtr += 4;
269
270 16382 bVal1 = _mm_loadu_ps(bPtr);
271 16382 bPtr += 4;
272 16382 bVal2 = _mm_loadu_ps(bPtr);
273 16382 bPtr += 4;
274 16382 bVal3 = _mm_loadu_ps(bPtr);
275 16382 bPtr += 4;
276 16382 bVal4 = _mm_loadu_ps(bPtr);
277 16382 bPtr += 4;
278
279 16382 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280 16382 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281 16382 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282 16382 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
283
284 16382 cVal1 = _mm_or_ps(cVal1, cVal2);
285 16382 cVal3 = _mm_or_ps(cVal3, cVal4);
286 16382 cVal1 = _mm_or_ps(cVal1, cVal3);
287
288 16382 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
289 }
290
291 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
292 _mm_store_ps(dotProductVector,
293 dotProdVal); // Store the results back into the dot product vector
294
295 2 dotProduct = dotProductVector[0];
296 2 dotProduct += dotProductVector[1];
297 2 dotProduct += dotProductVector[2];
298 2 dotProduct += dotProductVector[3];
299
300 2 number = sixteenthPoints * 16;
301
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
302 30 dotProduct += ((*aPtr++) * (*bPtr++));
303 }
304
305 2 *result = dotProduct;
306 2 }
307
308 #endif /*LV_HAVE_SSE4_1*/
309
310 #ifdef LV_HAVE_AVX
311
312 #include <immintrin.h>
313
314 2 static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
315 const float* input,
316 const float* taps,
317 unsigned int num_points)
318 {
319
320 2 unsigned int number = 0;
321 2 const unsigned int sixteenthPoints = num_points / 16;
322
323 2 float dotProduct = 0;
324 2 const float* aPtr = input;
325 2 const float* bPtr = taps;
326
327 __m256 a0Val, a1Val;
328 __m256 b0Val, b1Val;
329 __m256 c0Val, c1Val;
330
331 2 __m256 dotProdVal0 = _mm256_setzero_ps();
332 2 __m256 dotProdVal1 = _mm256_setzero_ps();
333
334
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
335
336 16382 a0Val = _mm256_loadu_ps(aPtr);
337 32764 a1Val = _mm256_loadu_ps(aPtr + 8);
338 16382 b0Val = _mm256_loadu_ps(bPtr);
339 32764 b1Val = _mm256_loadu_ps(bPtr + 8);
340
341 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
342 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
343
344 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
346
347 16382 aPtr += 16;
348 16382 bPtr += 16;
349 }
350
351 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
352
353 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
354
355 _mm256_storeu_ps(dotProductVector,
356 dotProdVal0); // Store the results back into the dot product vector
357
358 2 dotProduct = dotProductVector[0];
359 2 dotProduct += dotProductVector[1];
360 2 dotProduct += dotProductVector[2];
361 2 dotProduct += dotProductVector[3];
362 2 dotProduct += dotProductVector[4];
363 2 dotProduct += dotProductVector[5];
364 2 dotProduct += dotProductVector[6];
365 2 dotProduct += dotProductVector[7];
366
367 2 number = sixteenthPoints * 16;
368
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
369 30 dotProduct += ((*aPtr++) * (*bPtr++));
370 }
371
372 2 *result = dotProduct;
373 2 }
374
375 #endif /*LV_HAVE_AVX*/
376
377 #if LV_HAVE_AVX2 && LV_HAVE_FMA
378 #include <immintrin.h>
379 2 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
380 const float* input,
381 const float* taps,
382 unsigned int num_points)
383 {
384 unsigned int number;
385 2 const unsigned int eighthPoints = num_points / 8;
386
387 2 const float* aPtr = input;
388 2 const float* bPtr = taps;
389
390 2 __m256 dotProdVal = _mm256_setzero_ps();
391 __m256 aVal1, bVal1;
392
393
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < eighthPoints; number++) {
394
395 32766 aVal1 = _mm256_loadu_ps(aPtr);
396 32766 bVal1 = _mm256_loadu_ps(bPtr);
397 32766 aPtr += 8;
398 32766 bPtr += 8;
399
400 32766 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
401 }
402
403 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
404 _mm256_storeu_ps(dotProductVector,
405 dotProdVal); // Store the results back into the dot product vector
406
407 2 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408 2 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409 2 dotProductVector[6] + dotProductVector[7];
410
411
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = eighthPoints * 8; number < num_points; number++) {
412 14 dotProduct += ((*aPtr++) * (*bPtr++));
413 }
414
415 2 *result = dotProduct;
416 2 }
417 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
418
419 #if LV_HAVE_AVX512F
420 #include <immintrin.h>
421 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
422 const float* input,
423 const float* taps,
424 unsigned int num_points)
425 {
426 unsigned int number;
427 const unsigned int sixteenthPoints = num_points / 16;
428
429 const float* aPtr = input;
430 const float* bPtr = taps;
431
432 __m512 dotProdVal = _mm512_setzero_ps();
433 __m512 aVal1, bVal1;
434
435 for (number = 0; number < sixteenthPoints; number++) {
436
437 aVal1 = _mm512_loadu_ps(aPtr);
438 bVal1 = _mm512_loadu_ps(bPtr);
439 aPtr += 16;
440 bPtr += 16;
441
442 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
443 }
444
445 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
446 _mm512_storeu_ps(dotProductVector,
447 dotProdVal); // Store the results back into the dot product vector
448
449 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453 dotProductVector[12] + dotProductVector[13] +
454 dotProductVector[14] + dotProductVector[15];
455
456 for (number = sixteenthPoints * 16; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
458 }
459
460 *result = dotProduct;
461 }
462 #endif /* LV_HAVE_AVX512F */
463
464 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
465
466 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
468
469 #include <stdio.h>
470 #include <volk/volk_common.h>
471
472
473 #ifdef LV_HAVE_GENERIC
474
475
476 2 static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
477 const float* input,
478 const float* taps,
479 unsigned int num_points)
480 {
481
482 2 float dotProduct = 0;
483 2 const float* aPtr = input;
484 2 const float* bPtr = taps;
485 2 unsigned int number = 0;
486
487
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
488 262142 dotProduct += ((*aPtr++) * (*bPtr++));
489 }
490
491 2 *result = dotProduct;
492 2 }
493
494 #endif /*LV_HAVE_GENERIC*/
495
496
497 #ifdef LV_HAVE_SSE
498
499
500 2 static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
501 const float* input,
502 const float* taps,
503 unsigned int num_points)
504 {
505
506 2 unsigned int number = 0;
507 2 const unsigned int sixteenthPoints = num_points / 16;
508
509 2 float dotProduct = 0;
510 2 const float* aPtr = input;
511 2 const float* bPtr = taps;
512
513 __m128 a0Val, a1Val, a2Val, a3Val;
514 __m128 b0Val, b1Val, b2Val, b3Val;
515 __m128 c0Val, c1Val, c2Val, c3Val;
516
517 2 __m128 dotProdVal0 = _mm_setzero_ps();
518 2 __m128 dotProdVal1 = _mm_setzero_ps();
519 2 __m128 dotProdVal2 = _mm_setzero_ps();
520 2 __m128 dotProdVal3 = _mm_setzero_ps();
521
522
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
523
524 16382 a0Val = _mm_load_ps(aPtr);
525 16382 a1Val = _mm_load_ps(aPtr + 4);
526 16382 a2Val = _mm_load_ps(aPtr + 8);
527 32764 a3Val = _mm_load_ps(aPtr + 12);
528 16382 b0Val = _mm_load_ps(bPtr);
529 16382 b1Val = _mm_load_ps(bPtr + 4);
530 16382 b2Val = _mm_load_ps(bPtr + 8);
531 32764 b3Val = _mm_load_ps(bPtr + 12);
532
533 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
534 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
535 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
536 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
537
538 16382 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
539 16382 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
540 16382 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
541 16382 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
542
543 16382 aPtr += 16;
544 16382 bPtr += 16;
545 }
546
547 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
548 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
549 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
550
551 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
552
553 _mm_store_ps(dotProductVector,
554 dotProdVal0); // Store the results back into the dot product vector
555
556 2 dotProduct = dotProductVector[0];
557 2 dotProduct += dotProductVector[1];
558 2 dotProduct += dotProductVector[2];
559 2 dotProduct += dotProductVector[3];
560
561 2 number = sixteenthPoints * 16;
562
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
563 30 dotProduct += ((*aPtr++) * (*bPtr++));
564 }
565
566 2 *result = dotProduct;
567 2 }
568
569 #endif /*LV_HAVE_SSE*/
570
571 #ifdef LV_HAVE_SSE3
572
573 #include <pmmintrin.h>
574
575 2 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
576 const float* input,
577 const float* taps,
578 unsigned int num_points)
579 {
580 2 unsigned int number = 0;
581 2 const unsigned int sixteenthPoints = num_points / 16;
582
583 2 float dotProduct = 0;
584 2 const float* aPtr = input;
585 2 const float* bPtr = taps;
586
587 __m128 a0Val, a1Val, a2Val, a3Val;
588 __m128 b0Val, b1Val, b2Val, b3Val;
589 __m128 c0Val, c1Val, c2Val, c3Val;
590
591 2 __m128 dotProdVal0 = _mm_setzero_ps();
592 2 __m128 dotProdVal1 = _mm_setzero_ps();
593 2 __m128 dotProdVal2 = _mm_setzero_ps();
594 2 __m128 dotProdVal3 = _mm_setzero_ps();
595
596
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
597
598 16382 a0Val = _mm_load_ps(aPtr);
599 16382 a1Val = _mm_load_ps(aPtr + 4);
600 16382 a2Val = _mm_load_ps(aPtr + 8);
601 32764 a3Val = _mm_load_ps(aPtr + 12);
602 16382 b0Val = _mm_load_ps(bPtr);
603 16382 b1Val = _mm_load_ps(bPtr + 4);
604 16382 b2Val = _mm_load_ps(bPtr + 8);
605 32764 b3Val = _mm_load_ps(bPtr + 12);
606
607 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
608 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
609 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
610 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
611
612 16382 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
613 16382 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
614 16382 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
615 16382 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
616
617 16382 aPtr += 16;
618 16382 bPtr += 16;
619 }
620
621 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
622 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
623 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
624
625 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
626 _mm_store_ps(dotProductVector,
627 dotProdVal0); // Store the results back into the dot product vector
628
629 2 dotProduct = dotProductVector[0];
630 2 dotProduct += dotProductVector[1];
631 2 dotProduct += dotProductVector[2];
632 2 dotProduct += dotProductVector[3];
633
634 2 number = sixteenthPoints * 16;
635
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
636 30 dotProduct += ((*aPtr++) * (*bPtr++));
637 }
638
639 2 *result = dotProduct;
640 2 }
641
642 #endif /*LV_HAVE_SSE3*/
643
644 #ifdef LV_HAVE_SSE4_1
645
646 #include <smmintrin.h>
647
648 2 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
649 const float* input,
650 const float* taps,
651 unsigned int num_points)
652 {
653 2 unsigned int number = 0;
654 2 const unsigned int sixteenthPoints = num_points / 16;
655
656 2 float dotProduct = 0;
657 2 const float* aPtr = input;
658 2 const float* bPtr = taps;
659
660 __m128 aVal1, bVal1, cVal1;
661 __m128 aVal2, bVal2, cVal2;
662 __m128 aVal3, bVal3, cVal3;
663 __m128 aVal4, bVal4, cVal4;
664
665 2 __m128 dotProdVal = _mm_setzero_ps();
666
667
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
668
669 16382 aVal1 = _mm_load_ps(aPtr);
670 16382 aPtr += 4;
671 16382 aVal2 = _mm_load_ps(aPtr);
672 16382 aPtr += 4;
673 16382 aVal3 = _mm_load_ps(aPtr);
674 16382 aPtr += 4;
675 16382 aVal4 = _mm_load_ps(aPtr);
676 16382 aPtr += 4;
677
678 16382 bVal1 = _mm_load_ps(bPtr);
679 16382 bPtr += 4;
680 16382 bVal2 = _mm_load_ps(bPtr);
681 16382 bPtr += 4;
682 16382 bVal3 = _mm_load_ps(bPtr);
683 16382 bPtr += 4;
684 16382 bVal4 = _mm_load_ps(bPtr);
685 16382 bPtr += 4;
686
687 16382 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
688 16382 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
689 16382 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
690 16382 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
691
692 16382 cVal1 = _mm_or_ps(cVal1, cVal2);
693 16382 cVal3 = _mm_or_ps(cVal3, cVal4);
694 16382 cVal1 = _mm_or_ps(cVal1, cVal3);
695
696 16382 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
697 }
698
699 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
700 _mm_store_ps(dotProductVector,
701 dotProdVal); // Store the results back into the dot product vector
702
703 2 dotProduct = dotProductVector[0];
704 2 dotProduct += dotProductVector[1];
705 2 dotProduct += dotProductVector[2];
706 2 dotProduct += dotProductVector[3];
707
708 2 number = sixteenthPoints * 16;
709
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
710 30 dotProduct += ((*aPtr++) * (*bPtr++));
711 }
712
713 2 *result = dotProduct;
714 2 }
715
716 #endif /*LV_HAVE_SSE4_1*/
717
718 #ifdef LV_HAVE_AVX
719
720 #include <immintrin.h>
721
722 2 static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
723 const float* input,
724 const float* taps,
725 unsigned int num_points)
726 {
727
728 2 unsigned int number = 0;
729 2 const unsigned int sixteenthPoints = num_points / 16;
730
731 2 float dotProduct = 0;
732 2 const float* aPtr = input;
733 2 const float* bPtr = taps;
734
735 __m256 a0Val, a1Val;
736 __m256 b0Val, b1Val;
737 __m256 c0Val, c1Val;
738
739 2 __m256 dotProdVal0 = _mm256_setzero_ps();
740 2 __m256 dotProdVal1 = _mm256_setzero_ps();
741
742
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
743
744 16382 a0Val = _mm256_load_ps(aPtr);
745 32764 a1Val = _mm256_load_ps(aPtr + 8);
746 16382 b0Val = _mm256_load_ps(bPtr);
747 32764 b1Val = _mm256_load_ps(bPtr + 8);
748
749 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
750 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
751
752 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
753 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
754
755 16382 aPtr += 16;
756 16382 bPtr += 16;
757 }
758
759 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
760
761 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
762
763 _mm256_store_ps(dotProductVector,
764 dotProdVal0); // Store the results back into the dot product vector
765
766 2 dotProduct = dotProductVector[0];
767 2 dotProduct += dotProductVector[1];
768 2 dotProduct += dotProductVector[2];
769 2 dotProduct += dotProductVector[3];
770 2 dotProduct += dotProductVector[4];
771 2 dotProduct += dotProductVector[5];
772 2 dotProduct += dotProductVector[6];
773 2 dotProduct += dotProductVector[7];
774
775 2 number = sixteenthPoints * 16;
776
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
777 30 dotProduct += ((*aPtr++) * (*bPtr++));
778 }
779
780 2 *result = dotProduct;
781 2 }
782 #endif /*LV_HAVE_AVX*/
783
784
785 #if LV_HAVE_AVX2 && LV_HAVE_FMA
786 #include <immintrin.h>
787 2 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
788 const float* input,
789 const float* taps,
790 unsigned int num_points)
791 {
792 unsigned int number;
793 2 const unsigned int eighthPoints = num_points / 8;
794
795 2 const float* aPtr = input;
796 2 const float* bPtr = taps;
797
798 2 __m256 dotProdVal = _mm256_setzero_ps();
799 __m256 aVal1, bVal1;
800
801
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (number = 0; number < eighthPoints; number++) {
802
803 32766 aVal1 = _mm256_load_ps(aPtr);
804 32766 bVal1 = _mm256_load_ps(bPtr);
805 32766 aPtr += 8;
806 32766 bPtr += 8;
807
808 32766 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
809 }
810
811 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
812 _mm256_store_ps(dotProductVector,
813 dotProdVal); // Store the results back into the dot product vector
814
815 2 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
816 2 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
817 2 dotProductVector[6] + dotProductVector[7];
818
819
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (number = eighthPoints * 8; number < num_points; number++) {
820 14 dotProduct += ((*aPtr++) * (*bPtr++));
821 }
822
823 2 *result = dotProduct;
824 2 }
825 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
826
827 #if LV_HAVE_AVX512F
828 #include <immintrin.h>
829 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
830 const float* input,
831 const float* taps,
832 unsigned int num_points)
833 {
834 unsigned int number;
835 const unsigned int sixteenthPoints = num_points / 16;
836
837 const float* aPtr = input;
838 const float* bPtr = taps;
839
840 __m512 dotProdVal = _mm512_setzero_ps();
841 __m512 aVal1, bVal1;
842
843 for (number = 0; number < sixteenthPoints; number++) {
844
845 aVal1 = _mm512_load_ps(aPtr);
846 bVal1 = _mm512_load_ps(bPtr);
847 aPtr += 16;
848 bPtr += 16;
849
850 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
851 }
852
853 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
854 _mm512_store_ps(dotProductVector,
855 dotProdVal); // Store the results back into the dot product vector
856
857 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
858 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
859 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
860 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
861 dotProductVector[12] + dotProductVector[13] +
862 dotProductVector[14] + dotProductVector[15];
863
864 for (number = sixteenthPoints * 16; number < num_points; number++) {
865 dotProduct += ((*aPtr++) * (*bPtr++));
866 }
867
868 *result = dotProduct;
869 }
870 #endif /* LV_HAVE_AVX512F */
871
872 #ifdef LV_HAVE_NEON
873 #include <arm_neon.h>
874
875 static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
876 const float* input,
877 const float* taps,
878 unsigned int num_points)
879 {
880
881 unsigned int quarter_points = num_points / 16;
882 float dotProduct = 0;
883 const float* aPtr = input;
884 const float* bPtr = taps;
885 unsigned int number = 0;
886
887 float32x4x4_t a_val, b_val, accumulator0;
888 accumulator0.val[0] = vdupq_n_f32(0);
889 accumulator0.val[1] = vdupq_n_f32(0);
890 accumulator0.val[2] = vdupq_n_f32(0);
891 accumulator0.val[3] = vdupq_n_f32(0);
892 // factor of 4 loop unroll with independent accumulators
893 // uses 12 out of 16 neon q registers
894 for (number = 0; number < quarter_points; ++number) {
895 a_val = vld4q_f32(aPtr);
896 b_val = vld4q_f32(bPtr);
897 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
898 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
899 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
900 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
901 aPtr += 16;
902 bPtr += 16;
903 }
904 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
905 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
906 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
907 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
908 vst1q_f32(accumulator, accumulator0.val[0]);
909 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
910
911 for (number = quarter_points * 16; number < num_points; number++) {
912 dotProduct += ((*aPtr++) * (*bPtr++));
913 }
914
915 *result = dotProduct;
916 }
917
918 #endif
919
920
921 #ifdef LV_HAVE_NEON
922 static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
923 const float* input,
924 const float* taps,
925 unsigned int num_points)
926 {
927
928 unsigned int quarter_points = num_points / 8;
929 float dotProduct = 0;
930 const float* aPtr = input;
931 const float* bPtr = taps;
932 unsigned int number = 0;
933
934 float32x4x2_t a_val, b_val, accumulator_val;
935 accumulator_val.val[0] = vdupq_n_f32(0);
936 accumulator_val.val[1] = vdupq_n_f32(0);
937 // factor of 2 loop unroll with independent accumulators
938 for (number = 0; number < quarter_points; ++number) {
939 a_val = vld2q_f32(aPtr);
940 b_val = vld2q_f32(bPtr);
941 accumulator_val.val[0] =
942 vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
943 accumulator_val.val[1] =
944 vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
945 aPtr += 8;
946 bPtr += 8;
947 }
948 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
949 __VOLK_ATTR_ALIGNED(32) float accumulator[4];
950 vst1q_f32(accumulator, accumulator_val.val[0]);
951 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
952
953 for (number = quarter_points * 8; number < num_points; number++) {
954 dotProduct += ((*aPtr++) * (*bPtr++));
955 }
956
957 *result = dotProduct;
958 }
959
960 #endif /* LV_HAVE_NEON */
961
962 #ifdef LV_HAVE_NEONV7
963 extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
964 const float* aVector,
965 const float* bVector,
966 unsigned int num_points);
967 #endif /* LV_HAVE_NEONV7 */
968
969 #ifdef LV_HAVE_NEONV7
970 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
971 const float* aVector,
972 const float* bVector,
973 unsigned int num_points);
974 #endif /* LV_HAVE_NEONV7 */
975
976 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
977