GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_dot_prod_16i.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 263 361 72.9%
Functions: 7 9 77.8%
Branches: 26 34 76.5%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_dot_prod_16i
12 *
13 * \b Overview
14 *
15 * This block computes the dot product (or inner product) between two
16 * vectors, the \p input and \p taps vectors. Given a set of \p
17 * num_points taps, the result is the sum of products between the two
18 * vectors. The result is a single value stored in the \p result
19 * address and is converted to a fixed-point short.
20 *
21 * <b>Dispatcher Prototype</b>
22 * \code
23 * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps,
24 * unsigned int num_points) \endcode
25 *
26 * \b Inputs
27 * \li input: vector of floats.
28 * \li taps: float taps.
29 * \li num_points: number of samples in both \p input and \p taps.
30 *
31 * \b Outputs
32 * \li result: pointer to a short value to hold the dot product result.
33 *
34 * \b Example
35 * \code
36 * int N = 10000;
37 *
38 * <FIXME>
39 *
40 * volk_32f_x2_dot_prod_16i();
41 *
42 * \endcode
43 */
44
45 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
46 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
47
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50
51
52 #ifdef LV_HAVE_GENERIC
53
54
55 2 static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
56 const float* input,
57 const float* taps,
58 unsigned int num_points)
59 {
60
61 2 float dotProduct = 0;
62 2 const float* aPtr = input;
63 2 const float* bPtr = taps;
64 2 unsigned int number = 0;
65
66
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
67 262142 dotProduct += ((*aPtr++) * (*bPtr++));
68 }
69
70 2 *result = (int16_t)dotProduct;
71 2 }
72
73 #endif /*LV_HAVE_GENERIC*/
74
75
76 #ifdef LV_HAVE_SSE
77
78 2 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
79 const float* input,
80 const float* taps,
81 unsigned int num_points)
82 {
83
84 2 unsigned int number = 0;
85 2 const unsigned int sixteenthPoints = num_points / 16;
86
87 2 float dotProduct = 0;
88 2 const float* aPtr = input;
89 2 const float* bPtr = taps;
90
91 __m128 a0Val, a1Val, a2Val, a3Val;
92 __m128 b0Val, b1Val, b2Val, b3Val;
93 __m128 c0Val, c1Val, c2Val, c3Val;
94
95 2 __m128 dotProdVal0 = _mm_setzero_ps();
96 2 __m128 dotProdVal1 = _mm_setzero_ps();
97 2 __m128 dotProdVal2 = _mm_setzero_ps();
98 2 __m128 dotProdVal3 = _mm_setzero_ps();
99
100
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
101
102 16382 a0Val = _mm_load_ps(aPtr);
103 16382 a1Val = _mm_load_ps(aPtr + 4);
104 16382 a2Val = _mm_load_ps(aPtr + 8);
105 32764 a3Val = _mm_load_ps(aPtr + 12);
106 16382 b0Val = _mm_load_ps(bPtr);
107 16382 b1Val = _mm_load_ps(bPtr + 4);
108 16382 b2Val = _mm_load_ps(bPtr + 8);
109 32764 b3Val = _mm_load_ps(bPtr + 12);
110
111 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
112 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
113 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
114 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
115
116 16382 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
117 16382 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
118 16382 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
119 16382 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
120
121 16382 aPtr += 16;
122 16382 bPtr += 16;
123 }
124
125 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
126 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
127 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
128
129 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
130
131 _mm_store_ps(dotProductVector,
132 dotProdVal0); // Store the results back into the dot product vector
133
134 2 dotProduct = dotProductVector[0];
135 2 dotProduct += dotProductVector[1];
136 2 dotProduct += dotProductVector[2];
137 2 dotProduct += dotProductVector[3];
138
139 2 number = sixteenthPoints * 16;
140
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
141 30 dotProduct += ((*aPtr++) * (*bPtr++));
142 }
143
144 2 *result = (short)dotProduct;
145 2 }
146
147 #endif /*LV_HAVE_SSE*/
148
149
150 #if LV_HAVE_AVX2 && LV_HAVE_FMA
151
152 2 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
153 const float* input,
154 const float* taps,
155 unsigned int num_points)
156 {
157
158 2 unsigned int number = 0;
159 2 const unsigned int thirtysecondPoints = num_points / 32;
160
161 2 float dotProduct = 0;
162 2 const float* aPtr = input;
163 2 const float* bPtr = taps;
164
165 __m256 a0Val, a1Val, a2Val, a3Val;
166 __m256 b0Val, b1Val, b2Val, b3Val;
167
168 2 __m256 dotProdVal0 = _mm256_setzero_ps();
169 2 __m256 dotProdVal1 = _mm256_setzero_ps();
170 2 __m256 dotProdVal2 = _mm256_setzero_ps();
171 2 __m256 dotProdVal3 = _mm256_setzero_ps();
172
173
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtysecondPoints; number++) {
174
175 8190 a0Val = _mm256_load_ps(aPtr);
176 8190 a1Val = _mm256_load_ps(aPtr + 8);
177 8190 a2Val = _mm256_load_ps(aPtr + 16);
178 16380 a3Val = _mm256_load_ps(aPtr + 24);
179 8190 b0Val = _mm256_load_ps(bPtr);
180 8190 b1Val = _mm256_load_ps(bPtr + 8);
181 8190 b2Val = _mm256_load_ps(bPtr + 16);
182 16380 b3Val = _mm256_load_ps(bPtr + 24);
183
184 8190 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185 8190 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186 8190 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187 8190 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
188
189 8190 aPtr += 32;
190 8190 bPtr += 32;
191 }
192
193 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
196
197 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
198
199 _mm256_store_ps(dotProductVector,
200 dotProdVal0); // Store the results back into the dot product vector
201
202 2 dotProduct = dotProductVector[0];
203 2 dotProduct += dotProductVector[1];
204 2 dotProduct += dotProductVector[2];
205 2 dotProduct += dotProductVector[3];
206 2 dotProduct += dotProductVector[4];
207 2 dotProduct += dotProductVector[5];
208 2 dotProduct += dotProductVector[6];
209 2 dotProduct += dotProductVector[7];
210
211 2 number = thirtysecondPoints * 32;
212
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
213 62 dotProduct += ((*aPtr++) * (*bPtr++));
214 }
215
216 2 *result = (short)dotProduct;
217 2 }
218
219 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
220
221
222 #ifdef LV_HAVE_AVX
223
224 2 static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
225 const float* input,
226 const float* taps,
227 unsigned int num_points)
228 {
229
230 2 unsigned int number = 0;
231 2 const unsigned int thirtysecondPoints = num_points / 32;
232
233 2 float dotProduct = 0;
234 2 const float* aPtr = input;
235 2 const float* bPtr = taps;
236
237 __m256 a0Val, a1Val, a2Val, a3Val;
238 __m256 b0Val, b1Val, b2Val, b3Val;
239 __m256 c0Val, c1Val, c2Val, c3Val;
240
241 2 __m256 dotProdVal0 = _mm256_setzero_ps();
242 2 __m256 dotProdVal1 = _mm256_setzero_ps();
243 2 __m256 dotProdVal2 = _mm256_setzero_ps();
244 2 __m256 dotProdVal3 = _mm256_setzero_ps();
245
246
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtysecondPoints; number++) {
247
248 8190 a0Val = _mm256_load_ps(aPtr);
249 8190 a1Val = _mm256_load_ps(aPtr + 8);
250 8190 a2Val = _mm256_load_ps(aPtr + 16);
251 16380 a3Val = _mm256_load_ps(aPtr + 24);
252 8190 b0Val = _mm256_load_ps(bPtr);
253 8190 b1Val = _mm256_load_ps(bPtr + 8);
254 8190 b2Val = _mm256_load_ps(bPtr + 16);
255 16380 b3Val = _mm256_load_ps(bPtr + 24);
256
257 8190 c0Val = _mm256_mul_ps(a0Val, b0Val);
258 8190 c1Val = _mm256_mul_ps(a1Val, b1Val);
259 8190 c2Val = _mm256_mul_ps(a2Val, b2Val);
260 8190 c3Val = _mm256_mul_ps(a3Val, b3Val);
261
262 8190 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
263 8190 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
264 8190 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
265 8190 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
266
267 8190 aPtr += 32;
268 8190 bPtr += 32;
269 }
270
271 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
272 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
273 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
274
275 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
276
277 _mm256_store_ps(dotProductVector,
278 dotProdVal0); // Store the results back into the dot product vector
279
280 2 dotProduct = dotProductVector[0];
281 2 dotProduct += dotProductVector[1];
282 2 dotProduct += dotProductVector[2];
283 2 dotProduct += dotProductVector[3];
284 2 dotProduct += dotProductVector[4];
285 2 dotProduct += dotProductVector[5];
286 2 dotProduct += dotProductVector[6];
287 2 dotProduct += dotProductVector[7];
288
289 2 number = thirtysecondPoints * 32;
290
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
291 62 dotProduct += ((*aPtr++) * (*bPtr++));
292 }
293
294 2 *result = (short)dotProduct;
295 2 }
296
297 #endif /*LV_HAVE_AVX*/
298
299 #ifdef LV_HAVE_AVX512F
300
301 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
302 const float* input,
303 const float* taps,
304 unsigned int num_points)
305 {
306
307 unsigned int number = 0;
308 const unsigned int sixtyfourthPoints = num_points / 64;
309
310 float dotProduct = 0;
311 const float* aPtr = input;
312 const float* bPtr = taps;
313
314 __m512 a0Val, a1Val, a2Val, a3Val;
315 __m512 b0Val, b1Val, b2Val, b3Val;
316
317 __m512 dotProdVal0 = _mm512_setzero_ps();
318 __m512 dotProdVal1 = _mm512_setzero_ps();
319 __m512 dotProdVal2 = _mm512_setzero_ps();
320 __m512 dotProdVal3 = _mm512_setzero_ps();
321
322 for (; number < sixtyfourthPoints; number++) {
323
324 a0Val = _mm512_load_ps(aPtr);
325 a1Val = _mm512_load_ps(aPtr + 16);
326 a2Val = _mm512_load_ps(aPtr + 32);
327 a3Val = _mm512_load_ps(aPtr + 48);
328 b0Val = _mm512_load_ps(bPtr);
329 b1Val = _mm512_load_ps(bPtr + 16);
330 b2Val = _mm512_load_ps(bPtr + 32);
331 b3Val = _mm512_load_ps(bPtr + 48);
332
333 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
334 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
335 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
336 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
337
338 aPtr += 64;
339 bPtr += 64;
340 }
341
342 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
343 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
344 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
345
346 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
347
348 _mm512_store_ps(dotProductVector,
349 dotProdVal0); // Store the results back into the dot product vector
350
351 dotProduct = dotProductVector[0];
352 dotProduct += dotProductVector[1];
353 dotProduct += dotProductVector[2];
354 dotProduct += dotProductVector[3];
355 dotProduct += dotProductVector[4];
356 dotProduct += dotProductVector[5];
357 dotProduct += dotProductVector[6];
358 dotProduct += dotProductVector[7];
359 dotProduct += dotProductVector[8];
360 dotProduct += dotProductVector[9];
361 dotProduct += dotProductVector[10];
362 dotProduct += dotProductVector[11];
363 dotProduct += dotProductVector[12];
364 dotProduct += dotProductVector[13];
365 dotProduct += dotProductVector[14];
366 dotProduct += dotProductVector[15];
367
368 number = sixtyfourthPoints * 64;
369 for (; number < num_points; number++) {
370 dotProduct += ((*aPtr++) * (*bPtr++));
371 }
372
373 *result = (short)dotProduct;
374 }
375
376 #endif /*LV_HAVE_AVX512F*/
377
378
379 #ifdef LV_HAVE_SSE
380
381 2 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
382 const float* input,
383 const float* taps,
384 unsigned int num_points)
385 {
386
387 2 unsigned int number = 0;
388 2 const unsigned int sixteenthPoints = num_points / 16;
389
390 2 float dotProduct = 0;
391 2 const float* aPtr = input;
392 2 const float* bPtr = taps;
393
394 __m128 a0Val, a1Val, a2Val, a3Val;
395 __m128 b0Val, b1Val, b2Val, b3Val;
396 __m128 c0Val, c1Val, c2Val, c3Val;
397
398 2 __m128 dotProdVal0 = _mm_setzero_ps();
399 2 __m128 dotProdVal1 = _mm_setzero_ps();
400 2 __m128 dotProdVal2 = _mm_setzero_ps();
401 2 __m128 dotProdVal3 = _mm_setzero_ps();
402
403
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
16384 for (; number < sixteenthPoints; number++) {
404
405 16382 a0Val = _mm_loadu_ps(aPtr);
406 16382 a1Val = _mm_loadu_ps(aPtr + 4);
407 16382 a2Val = _mm_loadu_ps(aPtr + 8);
408 32764 a3Val = _mm_loadu_ps(aPtr + 12);
409 16382 b0Val = _mm_loadu_ps(bPtr);
410 16382 b1Val = _mm_loadu_ps(bPtr + 4);
411 16382 b2Val = _mm_loadu_ps(bPtr + 8);
412 32764 b3Val = _mm_loadu_ps(bPtr + 12);
413
414 16382 c0Val = _mm_mul_ps(a0Val, b0Val);
415 16382 c1Val = _mm_mul_ps(a1Val, b1Val);
416 16382 c2Val = _mm_mul_ps(a2Val, b2Val);
417 16382 c3Val = _mm_mul_ps(a3Val, b3Val);
418
419 16382 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
420 16382 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
421 16382 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
422 16382 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
423
424 16382 aPtr += 16;
425 16382 bPtr += 16;
426 }
427
428 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
429 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
430 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
431
432 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
433
434 _mm_store_ps(dotProductVector,
435 dotProdVal0); // Store the results back into the dot product vector
436
437 2 dotProduct = dotProductVector[0];
438 2 dotProduct += dotProductVector[1];
439 2 dotProduct += dotProductVector[2];
440 2 dotProduct += dotProductVector[3];
441
442 2 number = sixteenthPoints * 16;
443
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
32 for (; number < num_points; number++) {
444 30 dotProduct += ((*aPtr++) * (*bPtr++));
445 }
446
447 2 *result = (short)dotProduct;
448 2 }
449
450 #endif /*LV_HAVE_SSE*/
451
452
453 #if LV_HAVE_AVX2 && LV_HAVE_FMA
454
455 2 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
456 const float* input,
457 const float* taps,
458 unsigned int num_points)
459 {
460
461 2 unsigned int number = 0;
462 2 const unsigned int thirtysecondPoints = num_points / 32;
463
464 2 float dotProduct = 0;
465 2 const float* aPtr = input;
466 2 const float* bPtr = taps;
467
468 __m256 a0Val, a1Val, a2Val, a3Val;
469 __m256 b0Val, b1Val, b2Val, b3Val;
470
471 2 __m256 dotProdVal0 = _mm256_setzero_ps();
472 2 __m256 dotProdVal1 = _mm256_setzero_ps();
473 2 __m256 dotProdVal2 = _mm256_setzero_ps();
474 2 __m256 dotProdVal3 = _mm256_setzero_ps();
475
476
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtysecondPoints; number++) {
477
478 8190 a0Val = _mm256_loadu_ps(aPtr);
479 8190 a1Val = _mm256_loadu_ps(aPtr + 8);
480 8190 a2Val = _mm256_loadu_ps(aPtr + 16);
481 16380 a3Val = _mm256_loadu_ps(aPtr + 24);
482 8190 b0Val = _mm256_loadu_ps(bPtr);
483 8190 b1Val = _mm256_loadu_ps(bPtr + 8);
484 8190 b2Val = _mm256_loadu_ps(bPtr + 16);
485 16380 b3Val = _mm256_loadu_ps(bPtr + 24);
486
487 8190 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
488 8190 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
489 8190 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
490 8190 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
491
492 8190 aPtr += 32;
493 8190 bPtr += 32;
494 }
495
496 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
497 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
498 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
499
500 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
501
502 _mm256_store_ps(dotProductVector,
503 dotProdVal0); // Store the results back into the dot product vector
504
505 2 dotProduct = dotProductVector[0];
506 2 dotProduct += dotProductVector[1];
507 2 dotProduct += dotProductVector[2];
508 2 dotProduct += dotProductVector[3];
509 2 dotProduct += dotProductVector[4];
510 2 dotProduct += dotProductVector[5];
511 2 dotProduct += dotProductVector[6];
512 2 dotProduct += dotProductVector[7];
513
514 2 number = thirtysecondPoints * 32;
515
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
516 62 dotProduct += ((*aPtr++) * (*bPtr++));
517 }
518
519 2 *result = (short)dotProduct;
520 2 }
521
522 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
523
524
525 #ifdef LV_HAVE_AVX
526
527 2 static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
528 const float* input,
529 const float* taps,
530 unsigned int num_points)
531 {
532
533 2 unsigned int number = 0;
534 2 const unsigned int thirtysecondPoints = num_points / 32;
535
536 2 float dotProduct = 0;
537 2 const float* aPtr = input;
538 2 const float* bPtr = taps;
539
540 __m256 a0Val, a1Val, a2Val, a3Val;
541 __m256 b0Val, b1Val, b2Val, b3Val;
542 __m256 c0Val, c1Val, c2Val, c3Val;
543
544 2 __m256 dotProdVal0 = _mm256_setzero_ps();
545 2 __m256 dotProdVal1 = _mm256_setzero_ps();
546 2 __m256 dotProdVal2 = _mm256_setzero_ps();
547 2 __m256 dotProdVal3 = _mm256_setzero_ps();
548
549
2/2
✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
8192 for (; number < thirtysecondPoints; number++) {
550
551 8190 a0Val = _mm256_loadu_ps(aPtr);
552 8190 a1Val = _mm256_loadu_ps(aPtr + 8);
553 8190 a2Val = _mm256_loadu_ps(aPtr + 16);
554 16380 a3Val = _mm256_loadu_ps(aPtr + 24);
555 8190 b0Val = _mm256_loadu_ps(bPtr);
556 8190 b1Val = _mm256_loadu_ps(bPtr + 8);
557 8190 b2Val = _mm256_loadu_ps(bPtr + 16);
558 16380 b3Val = _mm256_loadu_ps(bPtr + 24);
559
560 8190 c0Val = _mm256_mul_ps(a0Val, b0Val);
561 8190 c1Val = _mm256_mul_ps(a1Val, b1Val);
562 8190 c2Val = _mm256_mul_ps(a2Val, b2Val);
563 8190 c3Val = _mm256_mul_ps(a3Val, b3Val);
564
565 8190 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
566 8190 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
567 8190 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
568 8190 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
569
570 8190 aPtr += 32;
571 8190 bPtr += 32;
572 }
573
574 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
575 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
576 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
577
578 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
579
580 _mm256_store_ps(dotProductVector,
581 dotProdVal0); // Store the results back into the dot product vector
582
583 2 dotProduct = dotProductVector[0];
584 2 dotProduct += dotProductVector[1];
585 2 dotProduct += dotProductVector[2];
586 2 dotProduct += dotProductVector[3];
587 2 dotProduct += dotProductVector[4];
588 2 dotProduct += dotProductVector[5];
589 2 dotProduct += dotProductVector[6];
590 2 dotProduct += dotProductVector[7];
591
592 2 number = thirtysecondPoints * 32;
593
2/2
✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
64 for (; number < num_points; number++) {
594 62 dotProduct += ((*aPtr++) * (*bPtr++));
595 }
596
597 2 *result = (short)dotProduct;
598 2 }
599
600 #endif /*LV_HAVE_AVX*/
601
602 #ifdef LV_HAVE_AVX512F
603
604 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
605 const float* input,
606 const float* taps,
607 unsigned int num_points)
608 {
609
610 unsigned int number = 0;
611 const unsigned int sixtyfourthPoints = num_points / 64;
612
613 float dotProduct = 0;
614 const float* aPtr = input;
615 const float* bPtr = taps;
616
617 __m512 a0Val, a1Val, a2Val, a3Val;
618 __m512 b0Val, b1Val, b2Val, b3Val;
619
620 __m512 dotProdVal0 = _mm512_setzero_ps();
621 __m512 dotProdVal1 = _mm512_setzero_ps();
622 __m512 dotProdVal2 = _mm512_setzero_ps();
623 __m512 dotProdVal3 = _mm512_setzero_ps();
624
625 for (; number < sixtyfourthPoints; number++) {
626
627 a0Val = _mm512_loadu_ps(aPtr);
628 a1Val = _mm512_loadu_ps(aPtr + 16);
629 a2Val = _mm512_loadu_ps(aPtr + 32);
630 a3Val = _mm512_loadu_ps(aPtr + 48);
631 b0Val = _mm512_loadu_ps(bPtr);
632 b1Val = _mm512_loadu_ps(bPtr + 16);
633 b2Val = _mm512_loadu_ps(bPtr + 32);
634 b3Val = _mm512_loadu_ps(bPtr + 48);
635
636 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
637 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
638 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
639 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
640
641 aPtr += 64;
642 bPtr += 64;
643 }
644
645 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
646 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
647 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
648
649 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
650
651 _mm512_storeu_ps(dotProductVector,
652 dotProdVal0); // Store the results back into the dot product vector
653
654 dotProduct = dotProductVector[0];
655 dotProduct += dotProductVector[1];
656 dotProduct += dotProductVector[2];
657 dotProduct += dotProductVector[3];
658 dotProduct += dotProductVector[4];
659 dotProduct += dotProductVector[5];
660 dotProduct += dotProductVector[6];
661 dotProduct += dotProductVector[7];
662 dotProduct += dotProductVector[8];
663 dotProduct += dotProductVector[9];
664 dotProduct += dotProductVector[10];
665 dotProduct += dotProductVector[11];
666 dotProduct += dotProductVector[12];
667 dotProduct += dotProductVector[13];
668 dotProduct += dotProductVector[14];
669 dotProduct += dotProductVector[15];
670
671 number = sixtyfourthPoints * 64;
672 for (; number < num_points; number++) {
673 dotProduct += ((*aPtr++) * (*bPtr++));
674 }
675
676 *result = (short)dotProduct;
677 }
678
679 #endif /*LV_HAVE_AVX512F*/
680
681
682 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
683