GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_32f_dot_prod_32fc.h
Date: 2023-10-23 23:10:04
           Exec  Total  Coverage
Lines:      289    289    100.0%
Functions:    7      7    100.0%
Branches:    26     26    100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_32f_dot_prod_32fc
12 *
13 * \b Overview
14 *
15 * This block computes the dot product (or inner product) between two
16 * vectors, the \p input and \p taps vectors. Given a set of \p
17 * num_points taps, the result is the sum of products between the two
18 * vectors. The result is a single complex value stored at the \p
19 * result address.
20 *
21 * <b>Dispatcher Prototype</b>
22 * \code
23 * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input,
24 *                                  const float* taps, unsigned int num_points) \endcode
25 *
26 * \b Inputs
27 * \li input: vector of complex samples
28 * \li taps: floating point taps
29 * \li num_points: number of samples in both \p input and \p taps
30 *
31 * \b Outputs
32 * \li result: pointer to a complex value to hold the dot product result.
33 *
34 * \b Example
35 * \code
36 * int N = 10000;
37 * lv_32fc_t y;
38 * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment());
39 * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment());
40 *
41 * <populate x and t with some values>
42 *
43 * volk_32fc_32f_dot_prod_32fc(&y, x, t, N);
44 *
45 * volk_free(x);
46 * volk_free(t);
47 * \endcode
48 */
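
For reference, the operation documented above is, in scalar terms,

    result = sum over n = 0 .. num_points-1 of input[n] * taps[n]

where each input[n] is complex and each taps[n] is real, so the n-th tap scales both the real and the imaginary part of the n-th sample. The generic kernel at source line 58 below is a direct transcription of this sum; the SIMD variants compute the same thing up to floating-point reordering.
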
49
50 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52
53 #include <stdio.h>
54 #include <volk/volk_common.h>
55
56 #ifdef LV_HAVE_GENERIC
57
58 2 static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
59 const lv_32fc_t* input,
60 const float* taps,
61 unsigned int num_points)
62 {
63
64 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
65 2 const float* aPtr = (float*)input;
66 2 const float* bPtr = taps;
67 2 unsigned int number = 0;
68
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
69 262144 for (number = 0; number < num_points; number++) {
70 262142 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
71 262142 aPtr += 2;
72 262142 bPtr += 1;
73 }
74
75 2 *result = returnValue;
76 2 }
77
78 #endif /*LV_HAVE_GENERIC*/
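
Because every SIMD variant below is expected to match the generic kernel up to floating-point reordering, a small self-check is a convenient companion to the per-kernel counts in this report. The following is a hypothetical sketch, not part of the library: it assumes this header is included, that <volk/volk.h> provides volk_malloc/volk_get_alignment/volk_free as in a normal VOLK install, and the function name check_against_generic is my own.

    #include <math.h>
    #include <volk/volk.h> /* volk_malloc, volk_get_alignment, volk_free */

    /* Hypothetical self-check: run the generic kernel and one SIMD kernel on
     * the same data.  num_points is chosen not to be a multiple of 16 so the
     * scalar tail loops are exercised too, as the counts in this report show. */
    static int check_against_generic(unsigned int num_points)
    {
        lv_32fc_t* in = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
                                                volk_get_alignment());
        float* taps = (float*)volk_malloc(num_points * sizeof(float),
                                          volk_get_alignment());
        for (unsigned int n = 0; n < num_points; n++) {
            in[n] = lv_cmake(0.5f * (float)n, -0.25f * (float)n);
            taps[n] = 1.0f / (float)(n + 1);
        }

        lv_32fc_t ref, simd;
        volk_32fc_32f_dot_prod_32fc_generic(&ref, in, taps, num_points);
    #ifdef LV_HAVE_AVX
        volk_32fc_32f_dot_prod_32fc_a_avx(&simd, in, taps, num_points);
    #else
        simd = ref; /* nothing to compare against on this build */
    #endif

        int ok = fabsf(lv_creal(ref) - lv_creal(simd)) < 1e-3f &&
                 fabsf(lv_cimag(ref) - lv_cimag(simd)) < 1e-3f;
        volk_free(in);
        volk_free(taps);
        return ok;
    }
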
79
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81
82 #include <immintrin.h>
83
84 2 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
85 const lv_32fc_t* input,
86 const float* taps,
87 unsigned int num_points)
88 {
89
90 2 unsigned int number = 0;
91 2 const unsigned int sixteenthPoints = num_points / 16;
92
93 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
94 2 const float* aPtr = (float*)input;
95 2 const float* bPtr = taps;
96
97 __m256 a0Val, a1Val, a2Val, a3Val;
98 __m256 b0Val, b1Val, b2Val, b3Val;
99 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
100
101 2 __m256 dotProdVal0 = _mm256_setzero_ps();
102 2 __m256 dotProdVal1 = _mm256_setzero_ps();
103 2 __m256 dotProdVal2 = _mm256_setzero_ps();
104 2 __m256 dotProdVal3 = _mm256_setzero_ps();
105
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
106 16384 for (; number < sixteenthPoints; number++) {
107
108 16382 a0Val = _mm256_load_ps(aPtr);
109 16382 a1Val = _mm256_load_ps(aPtr + 8);
110 16382 a2Val = _mm256_load_ps(aPtr + 16);
111 32764 a3Val = _mm256_load_ps(aPtr + 24);
112
113 16382 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
114 32764 x1Val = _mm256_load_ps(bPtr + 8);
115 16382 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
116 16382 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
117 16382 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
118 16382 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
119
120 // TODO: it may be possible to rearrange swizzling to better pipeline data
121 16382 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
122 16382 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
123 16382 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
124 16382 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
125
126 16382 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
127 16382 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
128 16382 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
129 16382 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
130
131 16382 aPtr += 32;
132 16382 bPtr += 16;
133 }
134
135 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
136 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
137 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
138
139 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
140
141 _mm256_store_ps(dotProductVector,
142 dotProdVal0); // Store the results back into the dot product vector
143
144 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
145 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
146 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
147 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
148
149 2 number = sixteenthPoints * 16;
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
150 32 for (; number < num_points; number++) {
151 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
152 30 aPtr += 2;
153 30 bPtr += 1;
154 }
155
156 2 *result = returnValue;
157 2 }
158
159 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
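
The unpack/permute sequence inside the AVX kernels exists only to duplicate each real tap so that it sits next to itself, lining up with the interleaved { re, im } pair of the matching complex sample. A scalar illustration of the layout it builds (the helper name is mine, not library code):

    /* Given 8 taps t[0..7] (one _mm256_load_ps of bPtr), the
     * unpacklo/unpackhi + _mm256_permute2f128_ps steps produce
     *   b0Val = { t0, t0, t1, t1, t2, t2, t3, t3 }   (selector 0x20)
     *   b1Val = { t4, t4, t5, t5, t6, t6, t7, t7 }   (selector 0x31)
     * so an element-wise multiply scales both halves of each complex sample. */
    static void expected_avx_tap_layout(const float t[8], float b0[8], float b1[8])
    {
        for (int k = 0; k < 4; k++) {
            b0[2 * k] = b0[2 * k + 1] = t[k];
            b1[2 * k] = b1[2 * k + 1] = t[k + 4];
        }
    }
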
160
161 #ifdef LV_HAVE_AVX
162
163 #include <immintrin.h>
164
165 2 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
166 const lv_32fc_t* input,
167 const float* taps,
168 unsigned int num_points)
169 {
170
171 2 unsigned int number = 0;
172 2 const unsigned int sixteenthPoints = num_points / 16;
173
174 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
175 2 const float* aPtr = (float*)input;
176 2 const float* bPtr = taps;
177
178 __m256 a0Val, a1Val, a2Val, a3Val;
179 __m256 b0Val, b1Val, b2Val, b3Val;
180 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
181 __m256 c0Val, c1Val, c2Val, c3Val;
182
183 2 __m256 dotProdVal0 = _mm256_setzero_ps();
184 2 __m256 dotProdVal1 = _mm256_setzero_ps();
185 2 __m256 dotProdVal2 = _mm256_setzero_ps();
186 2 __m256 dotProdVal3 = _mm256_setzero_ps();
187
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
188 16384 for (; number < sixteenthPoints; number++) {
189
190 16382 a0Val = _mm256_load_ps(aPtr);
191 16382 a1Val = _mm256_load_ps(aPtr + 8);
192 16382 a2Val = _mm256_load_ps(aPtr + 16);
193 32764 a3Val = _mm256_load_ps(aPtr + 24);
194
195 16382 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
196 32764 x1Val = _mm256_load_ps(bPtr + 8);
197 16382 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
198 16382 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
199 16382 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
200 16382 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
201
202 // TODO: it may be possible to rearrange swizzling to better pipeline data
203 16382 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
204 16382 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
205 16382 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
206 16382 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
207
208 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
209 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
210 16382 c2Val = _mm256_mul_ps(a2Val, b2Val);
211 16382 c3Val = _mm256_mul_ps(a3Val, b3Val);
212
213 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
214 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
215 16382 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
216 16382 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
217
218 16382 aPtr += 32;
219 16382 bPtr += 16;
220 }
221
222 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
223 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
224 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
225
226 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
227
228 _mm256_store_ps(dotProductVector,
229 dotProdVal0); // Store the results back into the dot product vector
230
231 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
232 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
233 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
234 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
235
236 2 number = sixteenthPoints * 16;
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
237 32 for (; number < num_points; number++) {
238 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
239 30 aPtr += 2;
240 30 bPtr += 1;
241 }
242
243 2 *result = returnValue;
244 2 }
245
246 #endif /*LV_HAVE_AVX*/
247
248
249 #ifdef LV_HAVE_SSE
250
251
252 2 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
253 const lv_32fc_t* input,
254 const float* taps,
255 unsigned int num_points)
256 {
257
258 2 unsigned int number = 0;
259 2 const unsigned int eighthPoints = num_points / 8;
260
261 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
262 2 const float* aPtr = (float*)input;
263 2 const float* bPtr = taps;
264
265 __m128 a0Val, a1Val, a2Val, a3Val;
266 __m128 b0Val, b1Val, b2Val, b3Val;
267 __m128 x0Val, x1Val, x2Val, x3Val;
268 __m128 c0Val, c1Val, c2Val, c3Val;
269
270 2 __m128 dotProdVal0 = _mm_setzero_ps();
271 2 __m128 dotProdVal1 = _mm_setzero_ps();
272 2 __m128 dotProdVal2 = _mm_setzero_ps();
273 2 __m128 dotProdVal3 = _mm_setzero_ps();
274
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
275 32768 for (; number < eighthPoints; number++) {
276
277 32766 a0Val = _mm_load_ps(aPtr);
278 32766 a1Val = _mm_load_ps(aPtr + 4);
279 32766 a2Val = _mm_load_ps(aPtr + 8);
280 65532 a3Val = _mm_load_ps(aPtr + 12);
281
282 32766 x0Val = _mm_load_ps(bPtr);
283 32766 x1Val = _mm_load_ps(bPtr);
284 32766 x2Val = _mm_load_ps(bPtr + 4);
285 65532 x3Val = _mm_load_ps(bPtr + 4);
286 32766 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
287 32766 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
288 32766 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
289 32766 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
290
291 32766 c0Val = _mm_mul_ps(a0Val, b0Val);
292 32766 c1Val = _mm_mul_ps(a1Val, b1Val);
293 32766 c2Val = _mm_mul_ps(a2Val, b2Val);
294 32766 c3Val = _mm_mul_ps(a3Val, b3Val);
295
296 32766 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
297 32766 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
298 32766 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
299 32766 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
300
301 32766 aPtr += 16;
302 32766 bPtr += 8;
303 }
304
305 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
306 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
307 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
308
309 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
310
311 _mm_store_ps(dotProductVector,
312 dotProdVal0); // Store the results back into the dot product vector
313
314 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
315 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
316
317 2 number = eighthPoints * 8;
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
318 16 for (; number < num_points; number++) {
319 14 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
320 14 aPtr += 2;
321 14 bPtr += 1;
322 }
323
324 2 *result = returnValue;
325 2 }
326
327 #endif /*LV_HAVE_SSE*/
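
The SSE kernels get the same tap duplication more cheaply: the four taps are loaded twice and unpacked against themselves, so no cross-lane permute is needed. A scalar illustration (helper name mine):

    /* _mm_unpacklo_ps(x, x) -> { t0, t0, t1, t1 },
     * _mm_unpackhi_ps(x, x) -> { t2, t2, t3, t3 }. */
    static void expected_sse_tap_layout(const float t[4], float b0[4], float b1[4])
    {
        b0[0] = b0[1] = t[0];
        b0[2] = b0[3] = t[1];
        b1[0] = b1[1] = t[2];
        b1[2] = b1[3] = t[3];
    }
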
328
329 #if LV_HAVE_AVX2 && LV_HAVE_FMA
330
331 #include <immintrin.h>
332
333 2 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
334 const lv_32fc_t* input,
335 const float* taps,
336 unsigned int num_points)
337 {
338
339 2 unsigned int number = 0;
340 2 const unsigned int sixteenthPoints = num_points / 16;
341
342 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
343 2 const float* aPtr = (float*)input;
344 2 const float* bPtr = taps;
345
346 __m256 a0Val, a1Val, a2Val, a3Val;
347 __m256 b0Val, b1Val, b2Val, b3Val;
348 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
349
350 2 __m256 dotProdVal0 = _mm256_setzero_ps();
351 2 __m256 dotProdVal1 = _mm256_setzero_ps();
352 2 __m256 dotProdVal2 = _mm256_setzero_ps();
353 2 __m256 dotProdVal3 = _mm256_setzero_ps();
354
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
355 16384 for (; number < sixteenthPoints; number++) {
356
357 16382 a0Val = _mm256_loadu_ps(aPtr);
358 16382 a1Val = _mm256_loadu_ps(aPtr + 8);
359 16382 a2Val = _mm256_loadu_ps(aPtr + 16);
360 32764 a3Val = _mm256_loadu_ps(aPtr + 24);
361
362 16382 x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
363 32764 x1Val = _mm256_loadu_ps(bPtr + 8);
364 16382 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
365 16382 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
366 16382 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
367 16382 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
368
369 // TODO: it may be possible to rearrange swizzling to better pipeline data
370 16382 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
371 16382 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
372 16382 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
373 16382 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
374
375 16382 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
376 16382 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
377 16382 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
378 16382 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
379
380 16382 aPtr += 32;
381 16382 bPtr += 16;
382 }
383
384 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
385 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
386 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
387
388 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
389
390 _mm256_store_ps(dotProductVector,
391 dotProdVal0); // Store the results back into the dot product vector
392
393 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
394 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
395 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
396 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
397
398 2 number = sixteenthPoints * 16;
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
399 32 for (; number < num_points; number++) {
400 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
401 30 aPtr += 2;
402 30 bPtr += 1;
403 }
404
405 2 *result = returnValue;
406 2 }
407
408 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
409
410 #ifdef LV_HAVE_AVX
411
412 #include <immintrin.h>
413
414 2 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
415 const lv_32fc_t* input,
416 const float* taps,
417 unsigned int num_points)
418 {
419
420 2 unsigned int number = 0;
421 2 const unsigned int sixteenthPoints = num_points / 16;
422
423 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
424 2 const float* aPtr = (float*)input;
425 2 const float* bPtr = taps;
426
427 __m256 a0Val, a1Val, a2Val, a3Val;
428 __m256 b0Val, b1Val, b2Val, b3Val;
429 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
430 __m256 c0Val, c1Val, c2Val, c3Val;
431
432 2 __m256 dotProdVal0 = _mm256_setzero_ps();
433 2 __m256 dotProdVal1 = _mm256_setzero_ps();
434 2 __m256 dotProdVal2 = _mm256_setzero_ps();
435 2 __m256 dotProdVal3 = _mm256_setzero_ps();
436
2/2
✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
437 16384 for (; number < sixteenthPoints; number++) {
438
439 16382 a0Val = _mm256_loadu_ps(aPtr);
440 16382 a1Val = _mm256_loadu_ps(aPtr + 8);
441 16382 a2Val = _mm256_loadu_ps(aPtr + 16);
442 32764 a3Val = _mm256_loadu_ps(aPtr + 24);
443
444 16382 x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
445 32764 x1Val = _mm256_loadu_ps(bPtr + 8);
446 16382 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
447 16382 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
448 16382 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
449 16382 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
450
451 // TODO: it may be possible to rearrange swizzling to better pipeline data
452 16382 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
453 16382 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
454 16382 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
455 16382 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
456
457 16382 c0Val = _mm256_mul_ps(a0Val, b0Val);
458 16382 c1Val = _mm256_mul_ps(a1Val, b1Val);
459 16382 c2Val = _mm256_mul_ps(a2Val, b2Val);
460 16382 c3Val = _mm256_mul_ps(a3Val, b3Val);
461
462 16382 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
463 16382 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
464 16382 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
465 16382 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
466
467 16382 aPtr += 32;
468 16382 bPtr += 16;
469 }
470
471 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
472 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
473 2 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
474
475 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
476
477 _mm256_store_ps(dotProductVector,
478 dotProdVal0); // Store the results back into the dot product vector
479
480 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
481 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
482 2 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
483 2 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
484
485 2 number = sixteenthPoints * 16;
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
486 32 for (; number < num_points; number++) {
487 30 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
488 30 aPtr += 2;
489 30 bPtr += 1;
490 }
491
492 2 *result = returnValue;
493 2 }
494 #endif /*LV_HAVE_AVX*/
495
496 #ifdef LV_HAVE_NEON
497 #include <arm_neon.h>
498
499 static inline void
500 volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
501 const lv_32fc_t* __restrict input,
502 const float* __restrict taps,
503 unsigned int num_points)
504 {
505
506 unsigned int number;
507 const unsigned int quarterPoints = num_points / 8;
508
509 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
510 const float* inputPtr = (float*)input;
511 const float* tapsPtr = taps;
512 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
513 float accVector_real[4];
514 float accVector_imag[4];
515
516 float32x4x2_t inputVector0, inputVector1;
517 float32x4_t tapsVector0, tapsVector1;
518 float32x4_t tmp_real0, tmp_imag0;
519 float32x4_t tmp_real1, tmp_imag1;
520 float32x4_t real_accumulator0, imag_accumulator0;
521 float32x4_t real_accumulator1, imag_accumulator1;
522
523 // zero out accumulators
524 // take a *float, return float32x4_t
525 real_accumulator0 = vld1q_f32(zero);
526 imag_accumulator0 = vld1q_f32(zero);
527 real_accumulator1 = vld1q_f32(zero);
528 imag_accumulator1 = vld1q_f32(zero);
529
530 for (number = 0; number < quarterPoints; number++) {
531 // load doublewords and duplicate into second lane
532 tapsVector0 = vld1q_f32(tapsPtr);
533 tapsVector1 = vld1q_f32(tapsPtr + 4);
534
535 // load quadword of complex numbers into 2 lanes. 1st lane is real, 2nd imag
536 inputVector0 = vld2q_f32(inputPtr);
537 inputVector1 = vld2q_f32(inputPtr + 8);
538 // inputVector is now a struct of two vectors, 0th is real, 1st is imag
539
540 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
541 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
542
543 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
544 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
545
546 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
547 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
548
549 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
550 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
551
552 tapsPtr += 8;
553 inputPtr += 16;
554 }
555
556 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
557 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
558 // void vst1q_f32( float32_t * ptr, float32x4_t val);
559 // store results back to a complex (array of 2 floats)
560 vst1q_f32(accVector_real, real_accumulator0);
561 vst1q_f32(accVector_imag, imag_accumulator0);
562 returnValue += lv_cmake(
563 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
564 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
565
566 // clean up the remainder
567 for (number = quarterPoints * 8; number < num_points; number++) {
568 returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
569 inputPtr += 2;
570 tapsPtr += 1;
571 }
572
573 *result = returnValue;
574 }
575
576 #endif /*LV_HAVE_NEON*/
577
578 #ifdef LV_HAVE_NEON
579 #include <arm_neon.h>
580
581 static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
582 const lv_32fc_t* __restrict input,
583 const float* __restrict taps,
584 unsigned int num_points)
585 {
586
587 unsigned int number;
588 const unsigned int quarterPoints = num_points / 4;
589
590 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
591 const float* inputPtr = (float*)input;
592 const float* tapsPtr = taps;
593 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
594 float accVector_real[4];
595 float accVector_imag[4];
596
597 float32x4x2_t inputVector;
598 float32x4_t tapsVector;
599 float32x4_t tmp_real, tmp_imag;
600 float32x4_t real_accumulator, imag_accumulator;
601
602
603 // zero out accumulators
604 // take a *float, return float32x4_t
605 real_accumulator = vld1q_f32(zero);
606 imag_accumulator = vld1q_f32(zero);
607
608 for (number = 0; number < quarterPoints; number++) {
609 // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
610 // load doublewords and duplicate into second lane
611 tapsVector = vld1q_f32(tapsPtr);
612
613 // load quadword of complex numbers into 2 lanes. 1st lane is real, 2nd imag
614 inputVector = vld2q_f32(inputPtr);
615
616 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
617 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
618
619 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
620 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
621
622
623 tapsPtr += 4;
624 inputPtr += 8;
625 }
626
627 // store results back to a complex (array of 2 floats)
628 vst1q_f32(accVector_real, real_accumulator);
629 vst1q_f32(accVector_imag, imag_accumulator);
630 returnValue += lv_cmake(
631 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
632 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
633
634 // clean up the remainder
635 for (number = quarterPoints * 4; number < num_points; number++) {
636 returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
637 inputPtr += 2;
638 tapsPtr += 1;
639 }
640
641 *result = returnValue;
642 }
643
644 #endif /*LV_HAVE_NEON*/
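
The NEON kernels take a different route from the x86 ones: vld2q_f32 de-interleaves four complex samples into a vector of reals and a vector of imaginaries, each of which is scaled by the same four taps and accumulated separately. Below is a scalar sketch of one iteration of the a_neon loop above, assuming lv_32fc_t is stored as interleaved { re, im } float pairs (which is what the (float*) casts in these kernels rely on); the function name is mine.

    static void neon_iteration_scalar(const float* input_floats, /* 8 floats = 4 samples */
                                      const float* taps,         /* 4 taps */
                                      float real_acc[4],
                                      float imag_acc[4])
    {
        for (int k = 0; k < 4; k++) {
            /* vld2q_f32 gives val[0][k] = re_k and val[1][k] = im_k */
            real_acc[k] += taps[k] * input_floats[2 * k];     /* vmulq_f32 + vaddq_f32 */
            imag_acc[k] += taps[k] * input_floats[2 * k + 1];
        }
        /* the kernel then sums each accumulator horizontally and packs the two
         * totals into one lv_32fc_t (source lines 628-632 above) */
    }
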
645
646 #ifdef LV_HAVE_NEONV7
647 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
648 const lv_32fc_t* input,
649 const float* taps,
650 unsigned int num_points);
651 #endif /*LV_HAVE_NEONV7*/
652
653 #ifdef LV_HAVE_NEONV7
654 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
655 const lv_32fc_t* input,
656 const float* taps,
657 unsigned int num_points);
658 #endif /*LV_HAVE_NEONV7*/
659
660 #ifdef LV_HAVE_NEONV7
661 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
662 const lv_32fc_t* input,
663 const float* taps,
664 unsigned int num_points);
665 #endif /*LV_HAVE_NEONV7*/
666
667 #ifdef LV_HAVE_SSE
668
669 2 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
670 const lv_32fc_t* input,
671 const float* taps,
672 unsigned int num_points)
673 {
674
675 2 unsigned int number = 0;
676 2 const unsigned int eighthPoints = num_points / 8;
677
678 2 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
679 2 const float* aPtr = (float*)input;
680 2 const float* bPtr = taps;
681
682 __m128 a0Val, a1Val, a2Val, a3Val;
683 __m128 b0Val, b1Val, b2Val, b3Val;
684 __m128 x0Val, x1Val, x2Val, x3Val;
685 __m128 c0Val, c1Val, c2Val, c3Val;
686
687 2 __m128 dotProdVal0 = _mm_setzero_ps();
688 2 __m128 dotProdVal1 = _mm_setzero_ps();
689 2 __m128 dotProdVal2 = _mm_setzero_ps();
690 2 __m128 dotProdVal3 = _mm_setzero_ps();
691
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
692 32768 for (; number < eighthPoints; number++) {
693
694 32766 a0Val = _mm_loadu_ps(aPtr);
695 32766 a1Val = _mm_loadu_ps(aPtr + 4);
696 32766 a2Val = _mm_loadu_ps(aPtr + 8);
697 65532 a3Val = _mm_loadu_ps(aPtr + 12);
698
699 32766 x0Val = _mm_loadu_ps(bPtr);
700 32766 x1Val = _mm_loadu_ps(bPtr);
701 32766 x2Val = _mm_loadu_ps(bPtr + 4);
702 65532 x3Val = _mm_loadu_ps(bPtr + 4);
703 32766 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
704 32766 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
705 32766 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
706 32766 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
707
708 32766 c0Val = _mm_mul_ps(a0Val, b0Val);
709 32766 c1Val = _mm_mul_ps(a1Val, b1Val);
710 32766 c2Val = _mm_mul_ps(a2Val, b2Val);
711 32766 c3Val = _mm_mul_ps(a3Val, b3Val);
712
713 32766 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
714 32766 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
715 32766 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
716 32766 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
717
718 32766 aPtr += 16;
719 32766 bPtr += 8;
720 }
721
722 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
723 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
724 2 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
725
726 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
727
728 _mm_store_ps(dotProductVector,
729 dotProdVal0); // Store the results back into the dot product vector
730
731 2 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
732 2 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
733
734 2 number = eighthPoints * 8;
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
735 16 for (; number < num_points; number++) {
736 14 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
737 14 aPtr += 2;
738 14 bPtr += 1;
739 }
740
741 2 *result = returnValue;
742 2 }
743
744 #endif /*LV_HAVE_SSE*/
745
746
747 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
748