Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_x2_dot_prod_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This block computes the dot product (or inner product) between two | ||
16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
17 | * num_points taps, the result is the sum of products between the two | ||
18 | * vectors. The result is a single float written to the address | ||
19 | * given by \p result; the function itself returns nothing. | ||
20 | * | ||
21 | * <b>Dispatcher Prototype</b> | ||
22 | * \code | ||
23 | * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, | ||
24 | * unsigned int num_points) \endcode | ||
25 | * | ||
26 | * \b Inputs | ||
27 | * \li input: vector of floats. | ||
28 | * \li taps: float taps. | ||
29 | * \li num_points: number of samples in both \p input and \p taps. | ||
30 | * | ||
31 | * \b Outputs | ||
32 | * \li result: pointer to a float value to hold the dot product result. | ||
33 | * | ||
34 | * \b Example | ||
35 | * Take the dot product of an increasing vector and a vector of ones. The result is the | ||
36 | * sum of integers (0,9). \code int N = 10; unsigned int alignment = volk_get_alignment(); | ||
37 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
38 | * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
39 | * float* out = (float*)volk_malloc(sizeof(float)*1, alignment); | ||
40 | * | ||
41 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
42 | * increasing[ii] = (float)ii; | ||
43 | * ones[ii] = 1.f; | ||
44 | * } | ||
45 | * | ||
46 | * volk_32f_x2_dot_prod_32f(out, increasing, ones, N); | ||
47 | * | ||
48 | * printf("out = %1.2f\n", *out); | ||
49 | * | ||
50 | * volk_free(increasing); | ||
51 | * volk_free(ones); | ||
52 | * volk_free(out); | ||
53 | * | ||
54 | * return 0; | ||
55 | * \endcode | ||
56 | */ | ||
57 | |||
58 | #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H | ||
59 | #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H | ||
60 | |||
61 | #include <stdio.h> | ||
62 | #include <volk/volk_common.h> | ||
63 | |||
64 | |||
65 | #ifdef LV_HAVE_GENERIC | ||
66 | |||
67 | |||
/*
 * Portable scalar dot product.
 *
 * Multiplies input[i] by taps[i] for each of the num_points samples,
 * accumulates the products in a single float in index order, and writes the
 * total to *result. With num_points == 0 the value written is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float sum = 0;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        sum += input[idx] * taps[idx];
    }

    *result = sum;
}
85 | |||
86 | #endif /*LV_HAVE_GENERIC*/ | ||
87 | |||
88 | |||
89 | #ifdef LV_HAVE_SSE | ||
90 | |||
91 | |||
92 | 2 | static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result, | |
93 | const float* input, | ||
94 | const float* taps, | ||
95 | unsigned int num_points) | ||
96 | { | ||
97 | |||
98 | 2 | unsigned int number = 0; | |
99 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
100 | |||
101 | 2 | float dotProduct = 0; | |
102 | 2 | const float* aPtr = input; | |
103 | 2 | const float* bPtr = taps; | |
104 | |||
105 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
106 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
107 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
108 | |||
109 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
110 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
111 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
112 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
113 | |||
114 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
115 | |||
116 | 16382 | a0Val = _mm_loadu_ps(aPtr); | |
117 | 16382 | a1Val = _mm_loadu_ps(aPtr + 4); | |
118 | 16382 | a2Val = _mm_loadu_ps(aPtr + 8); | |
119 | 32764 | a3Val = _mm_loadu_ps(aPtr + 12); | |
120 | 16382 | b0Val = _mm_loadu_ps(bPtr); | |
121 | 16382 | b1Val = _mm_loadu_ps(bPtr + 4); | |
122 | 16382 | b2Val = _mm_loadu_ps(bPtr + 8); | |
123 | 32764 | b3Val = _mm_loadu_ps(bPtr + 12); | |
124 | |||
125 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
126 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
127 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
128 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
129 | |||
130 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
131 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
132 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
133 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
134 | |||
135 | 16382 | aPtr += 16; | |
136 | 16382 | bPtr += 16; | |
137 | } | ||
138 | |||
139 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
140 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
141 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
142 | |||
143 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
144 | |||
145 | _mm_store_ps(dotProductVector, | ||
146 | dotProdVal0); // Store the results back into the dot product vector | ||
147 | |||
148 | 2 | dotProduct = dotProductVector[0]; | |
149 | 2 | dotProduct += dotProductVector[1]; | |
150 | 2 | dotProduct += dotProductVector[2]; | |
151 | 2 | dotProduct += dotProductVector[3]; | |
152 | |||
153 | 2 | number = sixteenthPoints * 16; | |
154 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
155 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
156 | } | ||
157 | |||
158 | 2 | *result = dotProduct; | |
159 | 2 | } | |
160 | |||
161 | #endif /*LV_HAVE_SSE*/ | ||
162 | |||
163 | #ifdef LV_HAVE_SSE3 | ||
164 | |||
165 | #include <pmmintrin.h> | ||
166 | |||
167 | 2 | static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result, | |
168 | const float* input, | ||
169 | const float* taps, | ||
170 | unsigned int num_points) | ||
171 | { | ||
172 | 2 | unsigned int number = 0; | |
173 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
174 | |||
175 | 2 | float dotProduct = 0; | |
176 | 2 | const float* aPtr = input; | |
177 | 2 | const float* bPtr = taps; | |
178 | |||
179 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
180 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
181 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
182 | |||
183 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
184 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
185 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
186 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
187 | |||
188 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
189 | |||
190 | 16382 | a0Val = _mm_loadu_ps(aPtr); | |
191 | 16382 | a1Val = _mm_loadu_ps(aPtr + 4); | |
192 | 16382 | a2Val = _mm_loadu_ps(aPtr + 8); | |
193 | 32764 | a3Val = _mm_loadu_ps(aPtr + 12); | |
194 | 16382 | b0Val = _mm_loadu_ps(bPtr); | |
195 | 16382 | b1Val = _mm_loadu_ps(bPtr + 4); | |
196 | 16382 | b2Val = _mm_loadu_ps(bPtr + 8); | |
197 | 32764 | b3Val = _mm_loadu_ps(bPtr + 12); | |
198 | |||
199 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
200 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
201 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
202 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
203 | |||
204 | 16382 | dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); | |
205 | 16382 | dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); | |
206 | 16382 | dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); | |
207 | 16382 | dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); | |
208 | |||
209 | 16382 | aPtr += 16; | |
210 | 16382 | bPtr += 16; | |
211 | } | ||
212 | |||
213 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
214 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
215 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
216 | |||
217 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
218 | _mm_store_ps(dotProductVector, | ||
219 | dotProdVal0); // Store the results back into the dot product vector | ||
220 | |||
221 | 2 | dotProduct = dotProductVector[0]; | |
222 | 2 | dotProduct += dotProductVector[1]; | |
223 | 2 | dotProduct += dotProductVector[2]; | |
224 | 2 | dotProduct += dotProductVector[3]; | |
225 | |||
226 | 2 | number = sixteenthPoints * 16; | |
227 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
228 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
229 | } | ||
230 | |||
231 | 2 | *result = dotProduct; | |
232 | 2 | } | |
233 | |||
234 | #endif /*LV_HAVE_SSE3*/ | ||
235 | |||
236 | #ifdef LV_HAVE_SSE4_1 | ||
237 | |||
238 | #include <smmintrin.h> | ||
239 | |||
240 | 2 | static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result, | |
241 | const float* input, | ||
242 | const float* taps, | ||
243 | unsigned int num_points) | ||
244 | { | ||
245 | 2 | unsigned int number = 0; | |
246 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
247 | |||
248 | 2 | float dotProduct = 0; | |
249 | 2 | const float* aPtr = input; | |
250 | 2 | const float* bPtr = taps; | |
251 | |||
252 | __m128 aVal1, bVal1, cVal1; | ||
253 | __m128 aVal2, bVal2, cVal2; | ||
254 | __m128 aVal3, bVal3, cVal3; | ||
255 | __m128 aVal4, bVal4, cVal4; | ||
256 | |||
257 | 2 | __m128 dotProdVal = _mm_setzero_ps(); | |
258 | |||
259 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
260 | |||
261 | 16382 | aVal1 = _mm_loadu_ps(aPtr); | |
262 | 16382 | aPtr += 4; | |
263 | 16382 | aVal2 = _mm_loadu_ps(aPtr); | |
264 | 16382 | aPtr += 4; | |
265 | 16382 | aVal3 = _mm_loadu_ps(aPtr); | |
266 | 16382 | aPtr += 4; | |
267 | 16382 | aVal4 = _mm_loadu_ps(aPtr); | |
268 | 16382 | aPtr += 4; | |
269 | |||
270 | 16382 | bVal1 = _mm_loadu_ps(bPtr); | |
271 | 16382 | bPtr += 4; | |
272 | 16382 | bVal2 = _mm_loadu_ps(bPtr); | |
273 | 16382 | bPtr += 4; | |
274 | 16382 | bVal3 = _mm_loadu_ps(bPtr); | |
275 | 16382 | bPtr += 4; | |
276 | 16382 | bVal4 = _mm_loadu_ps(bPtr); | |
277 | 16382 | bPtr += 4; | |
278 | |||
279 | 16382 | cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); | |
280 | 16382 | cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); | |
281 | 16382 | cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); | |
282 | 16382 | cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); | |
283 | |||
284 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal2); | |
285 | 16382 | cVal3 = _mm_or_ps(cVal3, cVal4); | |
286 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal3); | |
287 | |||
288 | 16382 | dotProdVal = _mm_add_ps(dotProdVal, cVal1); | |
289 | } | ||
290 | |||
291 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
292 | _mm_store_ps(dotProductVector, | ||
293 | dotProdVal); // Store the results back into the dot product vector | ||
294 | |||
295 | 2 | dotProduct = dotProductVector[0]; | |
296 | 2 | dotProduct += dotProductVector[1]; | |
297 | 2 | dotProduct += dotProductVector[2]; | |
298 | 2 | dotProduct += dotProductVector[3]; | |
299 | |||
300 | 2 | number = sixteenthPoints * 16; | |
301 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
302 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
303 | } | ||
304 | |||
305 | 2 | *result = dotProduct; | |
306 | 2 | } | |
307 | |||
308 | #endif /*LV_HAVE_SSE4_1*/ | ||
309 | |||
310 | #ifdef LV_HAVE_AVX | ||
311 | |||
312 | #include <immintrin.h> | ||
313 | |||
314 | 2 | static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result, | |
315 | const float* input, | ||
316 | const float* taps, | ||
317 | unsigned int num_points) | ||
318 | { | ||
319 | |||
320 | 2 | unsigned int number = 0; | |
321 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
322 | |||
323 | 2 | float dotProduct = 0; | |
324 | 2 | const float* aPtr = input; | |
325 | 2 | const float* bPtr = taps; | |
326 | |||
327 | __m256 a0Val, a1Val; | ||
328 | __m256 b0Val, b1Val; | ||
329 | __m256 c0Val, c1Val; | ||
330 | |||
331 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
332 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
333 | |||
334 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
335 | |||
336 | 16382 | a0Val = _mm256_loadu_ps(aPtr); | |
337 | 32764 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
338 | 16382 | b0Val = _mm256_loadu_ps(bPtr); | |
339 | 32764 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
340 | |||
341 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
342 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
343 | |||
344 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
345 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
346 | |||
347 | 16382 | aPtr += 16; | |
348 | 16382 | bPtr += 16; | |
349 | } | ||
350 | |||
351 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
352 | |||
353 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
354 | |||
355 | _mm256_storeu_ps(dotProductVector, | ||
356 | dotProdVal0); // Store the results back into the dot product vector | ||
357 | |||
358 | 2 | dotProduct = dotProductVector[0]; | |
359 | 2 | dotProduct += dotProductVector[1]; | |
360 | 2 | dotProduct += dotProductVector[2]; | |
361 | 2 | dotProduct += dotProductVector[3]; | |
362 | 2 | dotProduct += dotProductVector[4]; | |
363 | 2 | dotProduct += dotProductVector[5]; | |
364 | 2 | dotProduct += dotProductVector[6]; | |
365 | 2 | dotProduct += dotProductVector[7]; | |
366 | |||
367 | 2 | number = sixteenthPoints * 16; | |
368 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
369 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
370 | } | ||
371 | |||
372 | 2 | *result = dotProduct; | |
373 | 2 | } | |
374 | |||
375 | #endif /*LV_HAVE_AVX*/ | ||
376 | |||
377 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
378 | #include <immintrin.h> | ||
379 | 2 | static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result, | |
380 | const float* input, | ||
381 | const float* taps, | ||
382 | unsigned int num_points) | ||
383 | { | ||
384 | unsigned int number; | ||
385 | 2 | const unsigned int eighthPoints = num_points / 8; | |
386 | |||
387 | 2 | const float* aPtr = input; | |
388 | 2 | const float* bPtr = taps; | |
389 | |||
390 | 2 | __m256 dotProdVal = _mm256_setzero_ps(); | |
391 | __m256 aVal1, bVal1; | ||
392 | |||
393 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < eighthPoints; number++) { |
394 | |||
395 | 32766 | aVal1 = _mm256_loadu_ps(aPtr); | |
396 | 32766 | bVal1 = _mm256_loadu_ps(bPtr); | |
397 | 32766 | aPtr += 8; | |
398 | 32766 | bPtr += 8; | |
399 | |||
400 | 32766 | dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); | |
401 | } | ||
402 | |||
403 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
404 | _mm256_storeu_ps(dotProductVector, | ||
405 | dotProdVal); // Store the results back into the dot product vector | ||
406 | |||
407 | 2 | float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
408 | 2 | dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + | |
409 | 2 | dotProductVector[6] + dotProductVector[7]; | |
410 | |||
411 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = eighthPoints * 8; number < num_points; number++) { |
412 | 14 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
413 | } | ||
414 | |||
415 | 2 | *result = dotProduct; | |
416 | 2 | } | |
417 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ | ||
418 | |||
419 | #if LV_HAVE_AVX512F | ||
420 | #include <immintrin.h> | ||
421 | ✗ | static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result, | |
422 | const float* input, | ||
423 | const float* taps, | ||
424 | unsigned int num_points) | ||
425 | { | ||
426 | unsigned int number; | ||
427 | ✗ | const unsigned int sixteenthPoints = num_points / 16; | |
428 | |||
429 | ✗ | const float* aPtr = input; | |
430 | ✗ | const float* bPtr = taps; | |
431 | |||
432 | ✗ | __m512 dotProdVal = _mm512_setzero_ps(); | |
433 | __m512 aVal1, bVal1; | ||
434 | |||
435 | ✗ | for (number = 0; number < sixteenthPoints; number++) { | |
436 | |||
437 | ✗ | aVal1 = _mm512_loadu_ps(aPtr); | |
438 | ✗ | bVal1 = _mm512_loadu_ps(bPtr); | |
439 | ✗ | aPtr += 16; | |
440 | ✗ | bPtr += 16; | |
441 | |||
442 | ✗ | dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); | |
443 | } | ||
444 | |||
445 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
446 | _mm512_storeu_ps(dotProductVector, | ||
447 | dotProdVal); // Store the results back into the dot product vector | ||
448 | |||
449 | ✗ | float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
450 | ✗ | dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + | |
451 | ✗ | dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + | |
452 | ✗ | dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + | |
453 | ✗ | dotProductVector[12] + dotProductVector[13] + | |
454 | ✗ | dotProductVector[14] + dotProductVector[15]; | |
455 | |||
456 | ✗ | for (number = sixteenthPoints * 16; number < num_points; number++) { | |
457 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
458 | } | ||
459 | |||
460 | ✗ | *result = dotProduct; | |
461 | ✗ | } | |
462 | #endif /* LV_HAVE_AVX512F */ | ||
463 | |||
464 | #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ | ||
465 | |||
466 | #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H | ||
467 | #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H | ||
468 | |||
469 | #include <stdio.h> | ||
470 | #include <volk/volk_common.h> | ||
471 | |||
472 | |||
473 | #ifdef LV_HAVE_GENERIC | ||
474 | |||
475 | |||
/*
 * Portable scalar dot product (the "_a" header's generic entry).
 *
 * Walks input and taps in step, adding each product to a single float
 * accumulator in index order, and writes the total to *result. Uses only
 * scalar loads, and writes 0 when num_points is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    float total = 0;
    unsigned int remaining = num_points;

    while (remaining != 0) {
        total += (*input) * (*taps);
        input++;
        taps++;
        remaining--;
    }

    *result = total;
}
493 | |||
494 | #endif /*LV_HAVE_GENERIC*/ | ||
495 | |||
496 | |||
497 | #ifdef LV_HAVE_SSE | ||
498 | |||
499 | |||
500 | 2 | static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result, | |
501 | const float* input, | ||
502 | const float* taps, | ||
503 | unsigned int num_points) | ||
504 | { | ||
505 | |||
506 | 2 | unsigned int number = 0; | |
507 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
508 | |||
509 | 2 | float dotProduct = 0; | |
510 | 2 | const float* aPtr = input; | |
511 | 2 | const float* bPtr = taps; | |
512 | |||
513 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
514 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
515 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
516 | |||
517 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
518 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
519 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
520 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
521 | |||
522 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
523 | |||
524 | 16382 | a0Val = _mm_load_ps(aPtr); | |
525 | 16382 | a1Val = _mm_load_ps(aPtr + 4); | |
526 | 16382 | a2Val = _mm_load_ps(aPtr + 8); | |
527 | 32764 | a3Val = _mm_load_ps(aPtr + 12); | |
528 | 16382 | b0Val = _mm_load_ps(bPtr); | |
529 | 16382 | b1Val = _mm_load_ps(bPtr + 4); | |
530 | 16382 | b2Val = _mm_load_ps(bPtr + 8); | |
531 | 32764 | b3Val = _mm_load_ps(bPtr + 12); | |
532 | |||
533 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
534 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
535 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
536 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
537 | |||
538 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
539 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
540 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
541 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
542 | |||
543 | 16382 | aPtr += 16; | |
544 | 16382 | bPtr += 16; | |
545 | } | ||
546 | |||
547 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
548 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
549 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
550 | |||
551 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
552 | |||
553 | _mm_store_ps(dotProductVector, | ||
554 | dotProdVal0); // Store the results back into the dot product vector | ||
555 | |||
556 | 2 | dotProduct = dotProductVector[0]; | |
557 | 2 | dotProduct += dotProductVector[1]; | |
558 | 2 | dotProduct += dotProductVector[2]; | |
559 | 2 | dotProduct += dotProductVector[3]; | |
560 | |||
561 | 2 | number = sixteenthPoints * 16; | |
562 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
563 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
564 | } | ||
565 | |||
566 | 2 | *result = dotProduct; | |
567 | 2 | } | |
568 | |||
569 | #endif /*LV_HAVE_SSE*/ | ||
570 | |||
571 | #ifdef LV_HAVE_SSE3 | ||
572 | |||
573 | #include <pmmintrin.h> | ||
574 | |||
575 | 2 | static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result, | |
576 | const float* input, | ||
577 | const float* taps, | ||
578 | unsigned int num_points) | ||
579 | { | ||
580 | 2 | unsigned int number = 0; | |
581 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
582 | |||
583 | 2 | float dotProduct = 0; | |
584 | 2 | const float* aPtr = input; | |
585 | 2 | const float* bPtr = taps; | |
586 | |||
587 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
588 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
589 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
590 | |||
591 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
592 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
593 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
594 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
595 | |||
596 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
597 | |||
598 | 16382 | a0Val = _mm_load_ps(aPtr); | |
599 | 16382 | a1Val = _mm_load_ps(aPtr + 4); | |
600 | 16382 | a2Val = _mm_load_ps(aPtr + 8); | |
601 | 32764 | a3Val = _mm_load_ps(aPtr + 12); | |
602 | 16382 | b0Val = _mm_load_ps(bPtr); | |
603 | 16382 | b1Val = _mm_load_ps(bPtr + 4); | |
604 | 16382 | b2Val = _mm_load_ps(bPtr + 8); | |
605 | 32764 | b3Val = _mm_load_ps(bPtr + 12); | |
606 | |||
607 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
608 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
609 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
610 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
611 | |||
612 | 16382 | dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); | |
613 | 16382 | dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); | |
614 | 16382 | dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); | |
615 | 16382 | dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); | |
616 | |||
617 | 16382 | aPtr += 16; | |
618 | 16382 | bPtr += 16; | |
619 | } | ||
620 | |||
621 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
622 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
623 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
624 | |||
625 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
626 | _mm_store_ps(dotProductVector, | ||
627 | dotProdVal0); // Store the results back into the dot product vector | ||
628 | |||
629 | 2 | dotProduct = dotProductVector[0]; | |
630 | 2 | dotProduct += dotProductVector[1]; | |
631 | 2 | dotProduct += dotProductVector[2]; | |
632 | 2 | dotProduct += dotProductVector[3]; | |
633 | |||
634 | 2 | number = sixteenthPoints * 16; | |
635 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
636 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
637 | } | ||
638 | |||
639 | 2 | *result = dotProduct; | |
640 | 2 | } | |
641 | |||
642 | #endif /*LV_HAVE_SSE3*/ | ||
643 | |||
644 | #ifdef LV_HAVE_SSE4_1 | ||
645 | |||
646 | #include <smmintrin.h> | ||
647 | |||
648 | 2 | static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result, | |
649 | const float* input, | ||
650 | const float* taps, | ||
651 | unsigned int num_points) | ||
652 | { | ||
653 | 2 | unsigned int number = 0; | |
654 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
655 | |||
656 | 2 | float dotProduct = 0; | |
657 | 2 | const float* aPtr = input; | |
658 | 2 | const float* bPtr = taps; | |
659 | |||
660 | __m128 aVal1, bVal1, cVal1; | ||
661 | __m128 aVal2, bVal2, cVal2; | ||
662 | __m128 aVal3, bVal3, cVal3; | ||
663 | __m128 aVal4, bVal4, cVal4; | ||
664 | |||
665 | 2 | __m128 dotProdVal = _mm_setzero_ps(); | |
666 | |||
667 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
668 | |||
669 | 16382 | aVal1 = _mm_load_ps(aPtr); | |
670 | 16382 | aPtr += 4; | |
671 | 16382 | aVal2 = _mm_load_ps(aPtr); | |
672 | 16382 | aPtr += 4; | |
673 | 16382 | aVal3 = _mm_load_ps(aPtr); | |
674 | 16382 | aPtr += 4; | |
675 | 16382 | aVal4 = _mm_load_ps(aPtr); | |
676 | 16382 | aPtr += 4; | |
677 | |||
678 | 16382 | bVal1 = _mm_load_ps(bPtr); | |
679 | 16382 | bPtr += 4; | |
680 | 16382 | bVal2 = _mm_load_ps(bPtr); | |
681 | 16382 | bPtr += 4; | |
682 | 16382 | bVal3 = _mm_load_ps(bPtr); | |
683 | 16382 | bPtr += 4; | |
684 | 16382 | bVal4 = _mm_load_ps(bPtr); | |
685 | 16382 | bPtr += 4; | |
686 | |||
687 | 16382 | cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); | |
688 | 16382 | cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); | |
689 | 16382 | cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); | |
690 | 16382 | cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); | |
691 | |||
692 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal2); | |
693 | 16382 | cVal3 = _mm_or_ps(cVal3, cVal4); | |
694 | 16382 | cVal1 = _mm_or_ps(cVal1, cVal3); | |
695 | |||
696 | 16382 | dotProdVal = _mm_add_ps(dotProdVal, cVal1); | |
697 | } | ||
698 | |||
699 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
700 | _mm_store_ps(dotProductVector, | ||
701 | dotProdVal); // Store the results back into the dot product vector | ||
702 | |||
703 | 2 | dotProduct = dotProductVector[0]; | |
704 | 2 | dotProduct += dotProductVector[1]; | |
705 | 2 | dotProduct += dotProductVector[2]; | |
706 | 2 | dotProduct += dotProductVector[3]; | |
707 | |||
708 | 2 | number = sixteenthPoints * 16; | |
709 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
710 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
711 | } | ||
712 | |||
713 | 2 | *result = dotProduct; | |
714 | 2 | } | |
715 | |||
716 | #endif /*LV_HAVE_SSE4_1*/ | ||
717 | |||
718 | #ifdef LV_HAVE_AVX | ||
719 | |||
720 | #include <immintrin.h> | ||
721 | |||
722 | 2 | static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result, | |
723 | const float* input, | ||
724 | const float* taps, | ||
725 | unsigned int num_points) | ||
726 | { | ||
727 | |||
728 | 2 | unsigned int number = 0; | |
729 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
730 | |||
731 | 2 | float dotProduct = 0; | |
732 | 2 | const float* aPtr = input; | |
733 | 2 | const float* bPtr = taps; | |
734 | |||
735 | __m256 a0Val, a1Val; | ||
736 | __m256 b0Val, b1Val; | ||
737 | __m256 c0Val, c1Val; | ||
738 | |||
739 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
740 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
741 | |||
742 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
743 | |||
744 | 16382 | a0Val = _mm256_load_ps(aPtr); | |
745 | 32764 | a1Val = _mm256_load_ps(aPtr + 8); | |
746 | 16382 | b0Val = _mm256_load_ps(bPtr); | |
747 | 32764 | b1Val = _mm256_load_ps(bPtr + 8); | |
748 | |||
749 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
750 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
751 | |||
752 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
753 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
754 | |||
755 | 16382 | aPtr += 16; | |
756 | 16382 | bPtr += 16; | |
757 | } | ||
758 | |||
759 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
760 | |||
761 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
762 | |||
763 | _mm256_store_ps(dotProductVector, | ||
764 | dotProdVal0); // Store the results back into the dot product vector | ||
765 | |||
766 | 2 | dotProduct = dotProductVector[0]; | |
767 | 2 | dotProduct += dotProductVector[1]; | |
768 | 2 | dotProduct += dotProductVector[2]; | |
769 | 2 | dotProduct += dotProductVector[3]; | |
770 | 2 | dotProduct += dotProductVector[4]; | |
771 | 2 | dotProduct += dotProductVector[5]; | |
772 | 2 | dotProduct += dotProductVector[6]; | |
773 | 2 | dotProduct += dotProductVector[7]; | |
774 | |||
775 | 2 | number = sixteenthPoints * 16; | |
776 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
777 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
778 | } | ||
779 | |||
780 | 2 | *result = dotProduct; | |
781 | 2 | } | |
782 | #endif /*LV_HAVE_AVX*/ | ||
783 | |||
784 | |||
785 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
786 | #include <immintrin.h> | ||
787 | 2 | static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result, | |
788 | const float* input, | ||
789 | const float* taps, | ||
790 | unsigned int num_points) | ||
791 | { | ||
792 | unsigned int number; | ||
793 | 2 | const unsigned int eighthPoints = num_points / 8; | |
794 | |||
795 | 2 | const float* aPtr = input; | |
796 | 2 | const float* bPtr = taps; | |
797 | |||
798 | 2 | __m256 dotProdVal = _mm256_setzero_ps(); | |
799 | __m256 aVal1, bVal1; | ||
800 | |||
801 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < eighthPoints; number++) { |
802 | |||
803 | 32766 | aVal1 = _mm256_load_ps(aPtr); | |
804 | 32766 | bVal1 = _mm256_load_ps(bPtr); | |
805 | 32766 | aPtr += 8; | |
806 | 32766 | bPtr += 8; | |
807 | |||
808 | 32766 | dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); | |
809 | } | ||
810 | |||
811 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
812 | _mm256_store_ps(dotProductVector, | ||
813 | dotProdVal); // Store the results back into the dot product vector | ||
814 | |||
815 | 2 | float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
816 | 2 | dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + | |
817 | 2 | dotProductVector[6] + dotProductVector[7]; | |
818 | |||
819 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = eighthPoints * 8; number < num_points; number++) { |
820 | 14 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
821 | } | ||
822 | |||
823 | 2 | *result = dotProduct; | |
824 | 2 | } | |
825 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ | ||
826 | |||
827 | #if LV_HAVE_AVX512F | ||
828 | #include <immintrin.h> | ||
829 | ✗ | static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result, | |
830 | const float* input, | ||
831 | const float* taps, | ||
832 | unsigned int num_points) | ||
833 | { | ||
834 | unsigned int number; | ||
835 | ✗ | const unsigned int sixteenthPoints = num_points / 16; | |
836 | |||
837 | ✗ | const float* aPtr = input; | |
838 | ✗ | const float* bPtr = taps; | |
839 | |||
840 | ✗ | __m512 dotProdVal = _mm512_setzero_ps(); | |
841 | __m512 aVal1, bVal1; | ||
842 | |||
843 | ✗ | for (number = 0; number < sixteenthPoints; number++) { | |
844 | |||
845 | ✗ | aVal1 = _mm512_load_ps(aPtr); | |
846 | ✗ | bVal1 = _mm512_load_ps(bPtr); | |
847 | ✗ | aPtr += 16; | |
848 | ✗ | bPtr += 16; | |
849 | |||
850 | ✗ | dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); | |
851 | } | ||
852 | |||
853 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
854 | _mm512_store_ps(dotProductVector, | ||
855 | dotProdVal); // Store the results back into the dot product vector | ||
856 | |||
857 | ✗ | float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
858 | ✗ | dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + | |
859 | ✗ | dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + | |
860 | ✗ | dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + | |
861 | ✗ | dotProductVector[12] + dotProductVector[13] + | |
862 | ✗ | dotProductVector[14] + dotProductVector[15]; | |
863 | |||
864 | ✗ | for (number = sixteenthPoints * 16; number < num_points; number++) { | |
865 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
866 | } | ||
867 | |||
868 | ✗ | *result = dotProduct; | |
869 | ✗ | } | |
870 | #endif /* LV_HAVE_AVX512F */ | ||
871 | |||
872 | #ifdef LV_HAVE_NEON | ||
873 | #include <arm_neon.h> | ||
874 | |||
/*
 * Dot product of `input` and `taps` using NEON, unrolled by four.
 *
 * result:     receives the scalar dot product.
 * input:      first operand.
 * taps:       second operand.
 * num_points: number of floats in each operand.
 *
 * Each iteration consumes 16 floats per operand through vld4q_f32. That load
 * de-interleaves the data, but both operands are loaded the same way, so every
 * input element is still multiplied with its matching tap. Four independent
 * accumulators are kept and combined after the loop; the remaining
 * (num_points % 16) elements are handled by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;
    const float* in_ptr = input;
    const float* tap_ptr = taps;
    unsigned int i;

    float32x4x4_t acc;
    for (i = 0; i < 4; ++i) {
        acc.val[i] = vdupq_n_f32(0);
    }

    /* Four independent multiply-accumulate chains per iteration. */
    for (i = 0; i < num_blocks; ++i) {
        const float32x4x4_t in_vec = vld4q_f32(in_ptr);
        const float32x4x4_t tap_vec = vld4q_f32(tap_ptr);
        acc.val[0] = vmlaq_f32(acc.val[0], in_vec.val[0], tap_vec.val[0]);
        acc.val[1] = vmlaq_f32(acc.val[1], in_vec.val[1], tap_vec.val[1]);
        acc.val[2] = vmlaq_f32(acc.val[2], in_vec.val[2], tap_vec.val[2]);
        acc.val[3] = vmlaq_f32(acc.val[3], in_vec.val[3], tap_vec.val[3]);
        in_ptr += 16;
        tap_ptr += 16;
    }

    /* Pairwise reduction of the four accumulators into one vector. */
    const float32x4_t lower = vaddq_f32(acc.val[0], acc.val[1]);
    const float32x4_t upper = vaddq_f32(acc.val[2], acc.val[3]);
    const float32x4_t total = vaddq_f32(upper, lower);

    /* Spill the vector so its four lanes can be summed in scalar code. */
    __VOLK_ATTR_ALIGNED(32) float lanes[4];
    vst1q_f32(lanes, total);
    float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    /* Scalar tail for the elements that do not fill a whole block. */
    for (i = num_blocks * 16; i < num_points; i++) {
        sum += (*in_ptr++) * (*tap_ptr++);
    }

    *result = sum;
}
917 | |||
918 | #endif | ||
919 | |||
920 | |||
921 | #ifdef LV_HAVE_NEON | ||
/*
 * Dot product of `input` and `taps` using NEON, unrolled by two.
 *
 * result:     receives the scalar dot product.
 * input:      first operand.
 * taps:       second operand.
 * num_points: number of floats in each operand.
 *
 * Each iteration consumes 8 floats per operand through vld2q_f32. That load
 * de-interleaves the data, but both operands are loaded the same way, so every
 * input element is still multiplied with its matching tap. Two independent
 * accumulators are kept and combined after the loop; the remaining
 * (num_points % 8) elements are handled by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
                                                 const float* input,
                                                 const float* taps,
                                                 unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 8;
    const float* in_ptr = input;
    const float* tap_ptr = taps;
    unsigned int i;

    float32x4x2_t acc;
    acc.val[0] = vdupq_n_f32(0);
    acc.val[1] = vdupq_n_f32(0);

    /* Two independent multiply-accumulate chains per iteration. */
    for (i = 0; i < num_blocks; ++i) {
        const float32x4x2_t in_vec = vld2q_f32(in_ptr);
        const float32x4x2_t tap_vec = vld2q_f32(tap_ptr);
        acc.val[0] = vmlaq_f32(acc.val[0], in_vec.val[0], tap_vec.val[0]);
        acc.val[1] = vmlaq_f32(acc.val[1], in_vec.val[1], tap_vec.val[1]);
        in_ptr += 8;
        tap_ptr += 8;
    }

    /* Merge both accumulators, then sum the four lanes in scalar code. */
    const float32x4_t total = vaddq_f32(acc.val[0], acc.val[1]);
    __VOLK_ATTR_ALIGNED(32) float lanes[4];
    vst1q_f32(lanes, total);
    float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    /* Scalar tail for the elements that do not fill a whole block. */
    for (i = num_blocks * 8; i < num_points; i++) {
        sum += (*in_ptr++) * (*tap_ptr++);
    }

    *result = sum;
}
959 | |||
960 | #endif /* LV_HAVE_NEON */ | ||
961 | |||
962 | #ifdef LV_HAVE_NEONV7 | ||
/*
 * Declaration only: the definition lives outside this header and is visible
 * only when LV_HAVE_NEONV7 is defined.
 *
 * The signature has the same shape as the other dot-product kernels in this
 * file (float*, const float*, const float*, unsigned int).
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the input and taps vectors -- confirm against the
 * external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
967 | #endif /* LV_HAVE_NEONV7 */ | ||
968 | |||
969 | #ifdef LV_HAVE_NEONV7 | ||
/*
 * Declaration only: the definition lives outside this header and is visible
 * only when LV_HAVE_NEONV7 is defined.
 *
 * The signature has the same shape as the other dot-product kernels in this
 * file (float*, const float*, const float*, unsigned int).
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the input and taps vectors, and this is a tuned variant
 * of the plain neonasm routine -- confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points);
974 | #endif /* LV_HAVE_NEONV7 */ | ||
975 | |||
976 | #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ | ||
977 |