Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_32f_dot_prod_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This block computes the dot product (or inner product) between two | ||
16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
17 | * num_points taps, the result is the sum of products between the two | ||
18 | * vectors. The result is a single value stored in the \p result | ||
19 | * address and will be complex. | ||
20 | * | ||
21 | * <b>Dispatcher Prototype</b> | ||
22 | * \code | ||
23 | * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float | ||
24 | * * taps, unsigned int num_points) \endcode | ||
25 | * | ||
26 | * \b Inputs | ||
27 | * \li input: vector of complex samples | ||
28 | * \li taps: floating point taps | ||
29 | * \li num_points: number of samples in both \p input and \p taps | ||
30 | * | ||
31 | * \b Outputs | ||
32 | * \li result: pointer to a complex value to hold the dot product result. | ||
33 | * | ||
34 | * \b Example | ||
35 | * \code | ||
36 | * int N = 10000; | ||
37 | * lv_32fc_t y; | ||
38 | * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment()); | ||
39 | * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment()); | ||
40 | * | ||
41 | * <populate x and t with some values> | ||
42 | * | ||
43 | * volk_32fc_32f_dot_prod_32fc(&y, x, t, N); | ||
44 | * | ||
45 | * volk_free(x); | ||
46 | * volk_free(t); | ||
47 | * \endcode | ||
48 | */ | ||
49 | |||
50 | #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H | ||
51 | #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H | ||
52 | |||
53 | #include <stdio.h> | ||
54 | #include <volk/volk_common.h> | ||
55 | |||
56 | #ifdef LV_HAVE_GENERIC | ||
57 | |||
58 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, | |
59 | const lv_32fc_t* input, | ||
60 | const float* taps, | ||
61 | unsigned int num_points) | ||
62 | { | ||
63 | |||
64 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
65 | 2 | const float* aPtr = (float*)input; | |
66 | 2 | const float* bPtr = taps; | |
67 | 2 | unsigned int number = 0; | |
68 | |||
69 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
70 | 262142 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
71 | 262142 | aPtr += 2; | |
72 | 262142 | bPtr += 1; | |
73 | } | ||
74 | |||
75 | 2 | *result = returnValue; | |
76 | 2 | } | |
77 | |||
78 | #endif /*LV_HAVE_GENERIC*/ | ||
79 | |||
80 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
81 | |||
82 | #include <immintrin.h> | ||
83 | |||
84 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, | |
85 | const lv_32fc_t* input, | ||
86 | const float* taps, | ||
87 | unsigned int num_points) | ||
88 | { | ||
89 | |||
90 | 2 | unsigned int number = 0; | |
91 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
92 | |||
93 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
94 | 2 | const float* aPtr = (float*)input; | |
95 | 2 | const float* bPtr = taps; | |
96 | |||
97 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
98 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
99 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
100 | |||
101 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
102 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
103 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
104 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
105 | |||
106 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
107 | |||
108 | 16382 | a0Val = _mm256_load_ps(aPtr); | |
109 | 16382 | a1Val = _mm256_load_ps(aPtr + 8); | |
110 | 16382 | a2Val = _mm256_load_ps(aPtr + 16); | |
111 | 32764 | a3Val = _mm256_load_ps(aPtr + 24); | |
112 | |||
113 | 16382 | x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
114 | 32764 | x1Val = _mm256_load_ps(bPtr + 8); | |
115 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
116 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
117 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
118 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
119 | |||
120 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
121 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
122 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
123 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
124 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
125 | |||
126 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
127 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
128 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
129 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
130 | |||
131 | 16382 | aPtr += 32; | |
132 | 16382 | bPtr += 16; | |
133 | } | ||
134 | |||
135 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
136 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
137 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
138 | |||
139 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
140 | |||
141 | _mm256_store_ps(dotProductVector, | ||
142 | dotProdVal0); // Store the results back into the dot product vector | ||
143 | |||
144 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
145 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
146 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
147 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
148 | |||
149 | 2 | number = sixteenthPoints * 16; | |
150 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
151 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
152 | 30 | aPtr += 2; | |
153 | 30 | bPtr += 1; | |
154 | } | ||
155 | |||
156 | 2 | *result = returnValue; | |
157 | 2 | } | |
158 | |||
159 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
160 | |||
161 | #ifdef LV_HAVE_AVX | ||
162 | |||
163 | #include <immintrin.h> | ||
164 | |||
165 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
166 | const lv_32fc_t* input, | ||
167 | const float* taps, | ||
168 | unsigned int num_points) | ||
169 | { | ||
170 | |||
171 | 2 | unsigned int number = 0; | |
172 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
173 | |||
174 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
175 | 2 | const float* aPtr = (float*)input; | |
176 | 2 | const float* bPtr = taps; | |
177 | |||
178 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
179 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
180 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
181 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
182 | |||
183 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
184 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
185 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
186 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
187 | |||
188 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
189 | |||
190 | 16382 | a0Val = _mm256_load_ps(aPtr); | |
191 | 16382 | a1Val = _mm256_load_ps(aPtr + 8); | |
192 | 16382 | a2Val = _mm256_load_ps(aPtr + 16); | |
193 | 32764 | a3Val = _mm256_load_ps(aPtr + 24); | |
194 | |||
195 | 16382 | x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
196 | 32764 | x1Val = _mm256_load_ps(bPtr + 8); | |
197 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
198 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
199 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
200 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
201 | |||
202 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
203 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
204 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
205 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
206 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
207 | |||
208 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
209 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
210 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
211 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
212 | |||
213 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
214 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
215 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
216 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
217 | |||
218 | 16382 | aPtr += 32; | |
219 | 16382 | bPtr += 16; | |
220 | } | ||
221 | |||
222 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
223 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
224 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
225 | |||
226 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
227 | |||
228 | _mm256_store_ps(dotProductVector, | ||
229 | dotProdVal0); // Store the results back into the dot product vector | ||
230 | |||
231 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
232 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
233 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
234 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
235 | |||
236 | 2 | number = sixteenthPoints * 16; | |
237 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
238 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
239 | 30 | aPtr += 2; | |
240 | 30 | bPtr += 1; | |
241 | } | ||
242 | |||
243 | 2 | *result = returnValue; | |
244 | 2 | } | |
245 | |||
246 | #endif /*LV_HAVE_AVX*/ | ||
247 | |||
248 | |||
249 | #ifdef LV_HAVE_SSE | ||
250 | |||
251 | |||
252 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result, | |
253 | const lv_32fc_t* input, | ||
254 | const float* taps, | ||
255 | unsigned int num_points) | ||
256 | { | ||
257 | |||
258 | 2 | unsigned int number = 0; | |
259 | 2 | const unsigned int eighthPoints = num_points / 8; | |
260 | |||
261 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
262 | 2 | const float* aPtr = (float*)input; | |
263 | 2 | const float* bPtr = taps; | |
264 | |||
265 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
266 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
267 | __m128 x0Val, x1Val, x2Val, x3Val; | ||
268 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
269 | |||
270 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
271 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
272 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
273 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
274 | |||
275 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
276 | |||
277 | 32766 | a0Val = _mm_load_ps(aPtr); | |
278 | 32766 | a1Val = _mm_load_ps(aPtr + 4); | |
279 | 32766 | a2Val = _mm_load_ps(aPtr + 8); | |
280 | 65532 | a3Val = _mm_load_ps(aPtr + 12); | |
281 | |||
282 | 32766 | x0Val = _mm_load_ps(bPtr); | |
283 | 32766 | x1Val = _mm_load_ps(bPtr); | |
284 | 32766 | x2Val = _mm_load_ps(bPtr + 4); | |
285 | 65532 | x3Val = _mm_load_ps(bPtr + 4); | |
286 | 32766 | b0Val = _mm_unpacklo_ps(x0Val, x1Val); | |
287 | 32766 | b1Val = _mm_unpackhi_ps(x0Val, x1Val); | |
288 | 32766 | b2Val = _mm_unpacklo_ps(x2Val, x3Val); | |
289 | 32766 | b3Val = _mm_unpackhi_ps(x2Val, x3Val); | |
290 | |||
291 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
292 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
293 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
294 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
295 | |||
296 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
297 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
298 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
299 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
300 | |||
301 | 32766 | aPtr += 16; | |
302 | 32766 | bPtr += 8; | |
303 | } | ||
304 | |||
305 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
306 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
307 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
308 | |||
309 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
310 | |||
311 | _mm_store_ps(dotProductVector, | ||
312 | dotProdVal0); // Store the results back into the dot product vector | ||
313 | |||
314 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
315 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
316 | |||
317 | 2 | number = eighthPoints * 8; | |
318 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
319 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
320 | 14 | aPtr += 2; | |
321 | 14 | bPtr += 1; | |
322 | } | ||
323 | |||
324 | 2 | *result = returnValue; | |
325 | 2 | } | |
326 | |||
327 | #endif /*LV_HAVE_SSE*/ | ||
328 | |||
329 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
330 | |||
331 | #include <immintrin.h> | ||
332 | |||
333 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, | |
334 | const lv_32fc_t* input, | ||
335 | const float* taps, | ||
336 | unsigned int num_points) | ||
337 | { | ||
338 | |||
339 | 2 | unsigned int number = 0; | |
340 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
341 | |||
342 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
343 | 2 | const float* aPtr = (float*)input; | |
344 | 2 | const float* bPtr = taps; | |
345 | |||
346 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
347 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
348 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
349 | |||
350 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
351 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
352 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
353 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
354 | |||
355 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
356 | |||
357 | 16382 | a0Val = _mm256_loadu_ps(aPtr); | |
358 | 16382 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
359 | 16382 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
360 | 32764 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
361 | |||
362 | 16382 | x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
363 | 32764 | x1Val = _mm256_loadu_ps(bPtr + 8); | |
364 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
365 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
366 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
367 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
368 | |||
369 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
370 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
371 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
372 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
373 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
374 | |||
375 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
376 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
377 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
378 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
379 | |||
380 | 16382 | aPtr += 32; | |
381 | 16382 | bPtr += 16; | |
382 | } | ||
383 | |||
384 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
385 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
386 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
387 | |||
388 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
389 | |||
390 | _mm256_store_ps(dotProductVector, | ||
391 | dotProdVal0); // Store the results back into the dot product vector | ||
392 | |||
393 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
394 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
395 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
396 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
397 | |||
398 | 2 | number = sixteenthPoints * 16; | |
399 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
400 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
401 | 30 | aPtr += 2; | |
402 | 30 | bPtr += 1; | |
403 | } | ||
404 | |||
405 | 2 | *result = returnValue; | |
406 | 2 | } | |
407 | |||
408 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
409 | |||
410 | #ifdef LV_HAVE_AVX | ||
411 | |||
412 | #include <immintrin.h> | ||
413 | |||
414 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
415 | const lv_32fc_t* input, | ||
416 | const float* taps, | ||
417 | unsigned int num_points) | ||
418 | { | ||
419 | |||
420 | 2 | unsigned int number = 0; | |
421 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
422 | |||
423 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
424 | 2 | const float* aPtr = (float*)input; | |
425 | 2 | const float* bPtr = taps; | |
426 | |||
427 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
428 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
429 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
430 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
431 | |||
432 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
433 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
434 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
435 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
436 | |||
437 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
438 | |||
439 | 16382 | a0Val = _mm256_loadu_ps(aPtr); | |
440 | 16382 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
441 | 16382 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
442 | 32764 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
443 | |||
444 | 16382 | x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
445 | 32764 | x1Val = _mm256_loadu_ps(bPtr + 8); | |
446 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
447 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
448 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
449 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
450 | |||
451 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
452 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
453 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
454 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
455 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
456 | |||
457 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
458 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
459 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
460 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
461 | |||
462 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
463 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
464 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
465 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
466 | |||
467 | 16382 | aPtr += 32; | |
468 | 16382 | bPtr += 16; | |
469 | } | ||
470 | |||
471 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
472 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
473 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
474 | |||
475 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
476 | |||
477 | _mm256_store_ps(dotProductVector, | ||
478 | dotProdVal0); // Store the results back into the dot product vector | ||
479 | |||
480 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
481 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
482 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
483 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
484 | |||
485 | 2 | number = sixteenthPoints * 16; | |
486 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
487 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
488 | 30 | aPtr += 2; | |
489 | 30 | bPtr += 1; | |
490 | } | ||
491 | |||
492 | 2 | *result = returnValue; | |
493 | 2 | } | |
494 | #endif /*LV_HAVE_AVX*/ | ||
495 | |||
496 | #ifdef LV_HAVE_NEON | ||
497 | #include <arm_neon.h> | ||
498 | |||
/*!
 * \brief NEON kernel, unrolled by two: dot product of complex samples with real taps.
 *
 * Each main-loop iteration consumes 8 complex samples and 8 taps as two groups
 * of four. vld2q_f32 de-interleaves the samples into one vector of real parts
 * and one of imaginary parts, each of which is multiplied by the tap vector and
 * added to its own accumulator. After the loop the two accumulator pairs are
 * combined, the four lanes of each are summed into a single complex value, and
 * the samples left over after the last full group of 8 are processed one by one.
 *
 * \param result     receives the accumulated complex sum
 * \param input      complex samples, read as interleaved (real, imag) floats
 * \param taps       real-valued taps, one per complex sample
 * \param num_points number of samples in both \p input and \p taps
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{

    unsigned int number;
    /* The loop body handles eight points, so this is num_points / 8. */
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (const float*)input; /* keep the const qualifier */
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector0, inputVector1;
    float32x4_t tapsVector0, tapsVector1;
    float32x4_t tmp_real0, tmp_imag0;
    float32x4_t tmp_real1, tmp_imag1;

    // zero out accumulators directly in registers
    float32x4_t real_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t real_accumulator1 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator1 = vdupq_n_f32(0.0f);

    for (number = 0; number < eighthPoints; number++) {
        // load two groups of four consecutive taps
        tapsVector0 = vld1q_f32(tapsPtr);
        tapsVector1 = vld1q_f32(tapsPtr + 4);

        // load four complex numbers per call, de-interleaved:
        // val[0] holds the real parts, val[1] the imaginary parts
        inputVector0 = vld2q_f32(inputPtr);
        inputVector1 = vld2q_f32(inputPtr + 8);

        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);

        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);

        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);

        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);

        tapsPtr += 8;
        inputPtr += 16;
    }

    // merge the two unrolled accumulator pairs
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);

    // spill the lanes to memory and sum them into one complex value
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // clean up the remainder
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
575 | |||
576 | #endif /*LV_HAVE_NEON*/ | ||
577 | |||
578 | #ifdef LV_HAVE_NEON | ||
579 | #include <arm_neon.h> | ||
580 | |||
/*!
 * \brief NEON kernel: dot product of complex samples with real taps.
 *
 * Each main-loop iteration consumes 4 complex samples and 4 taps. vld2q_f32
 * de-interleaves the samples into one vector of real parts and one of imaginary
 * parts, each of which is multiplied by the tap vector and added to its own
 * accumulator. After the loop the four lanes of each accumulator are summed
 * into a single complex value, and the samples left over after the last full
 * group of 4 are processed one by one.
 *
 * \param result     receives the accumulated complex sum
 * \param input      complex samples, read as interleaved (real, imag) floats
 * \param taps       real-valued taps, one per complex sample
 * \param num_points number of samples in both \p input and \p taps
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{

    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (const float*)input; /* keep the const qualifier */
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector;
    float32x4_t tapsVector;
    float32x4_t tmp_real, tmp_imag;

    // zero out accumulators directly in registers
    float32x4_t real_accumulator = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator = vdupq_n_f32(0.0f);

    for (number = 0; number < quarterPoints; number++) {
        // load four consecutive taps
        tapsVector = vld1q_f32(tapsPtr);

        // load four complex numbers, de-interleaved:
        // val[0] holds the real parts, val[1] the imaginary parts
        inputVector = vld2q_f32(inputPtr);

        tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
        tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);

        real_accumulator = vaddq_f32(real_accumulator, tmp_real);
        imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);

        tapsPtr += 4;
        inputPtr += 8;
    }

    // spill the lanes to memory and sum them into one complex value
    vst1q_f32(accVector_real, real_accumulator);
    vst1q_f32(accVector_imag, imag_accumulator);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // clean up the remainder
    for (number = quarterPoints * 4; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
643 | |||
644 | #endif /*LV_HAVE_NEON*/ | ||
645 | |||
646 | #ifdef LV_HAVE_NEONV7 | ||
/*!
 * \brief Prototype of a NEONv7 dot-product kernel whose body lives outside this header.
 *
 * Only the declaration is visible here. The parameter list matches the other
 * kernels in this file (result, complex input, float taps, sample count).
 * NOTE(review): the "neonasm" suffix suggests a hand-written assembly
 * implementation -- confirm against the file that defines it.
 */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);
651 | #endif /*LV_HAVE_NEONV7*/ | ||
652 | |||
653 | #ifdef LV_HAVE_NEONV7 | ||
/*!
 * \brief Prototype of a NEONv7 dot-product kernel whose body lives outside this header.
 *
 * Only the declaration is visible here. The parameter list matches the other
 * kernels in this file (result, complex input, float taps, sample count).
 * NOTE(review): the "neonasmvmla" suffix suggests an assembly implementation
 * built around the VMLA multiply-accumulate instruction -- confirm against the
 * file that defines it.
 */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);
658 | #endif /*LV_HAVE_NEONV7*/ | ||
659 | |||
660 | #ifdef LV_HAVE_NEONV7 | ||
/*!
 * \brief Prototype of a NEONv7 dot-product kernel whose body lives outside this header.
 *
 * Only the declaration is visible here. The parameter list matches the other
 * kernels in this file (result, complex input, float taps, sample count).
 * NOTE(review): the "neonpipeline" suffix suggests a software-pipelined
 * implementation -- confirm against the file that defines it.
 */
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
665 | #endif /*LV_HAVE_NEONV7*/ | ||
666 | |||
667 | #ifdef LV_HAVE_SSE | ||
668 | |||
669 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, | |
670 | const lv_32fc_t* input, | ||
671 | const float* taps, | ||
672 | unsigned int num_points) | ||
673 | { | ||
674 | |||
675 | 2 | unsigned int number = 0; | |
676 | 2 | const unsigned int eighthPoints = num_points / 8; | |
677 | |||
678 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
679 | 2 | const float* aPtr = (float*)input; | |
680 | 2 | const float* bPtr = taps; | |
681 | |||
682 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
683 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
684 | __m128 x0Val, x1Val, x2Val, x3Val; | ||
685 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
686 | |||
687 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
688 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
689 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
690 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
691 | |||
692 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
693 | |||
694 | 32766 | a0Val = _mm_loadu_ps(aPtr); | |
695 | 32766 | a1Val = _mm_loadu_ps(aPtr + 4); | |
696 | 32766 | a2Val = _mm_loadu_ps(aPtr + 8); | |
697 | 65532 | a3Val = _mm_loadu_ps(aPtr + 12); | |
698 | |||
699 | 32766 | x0Val = _mm_loadu_ps(bPtr); | |
700 | 32766 | x1Val = _mm_loadu_ps(bPtr); | |
701 | 32766 | x2Val = _mm_loadu_ps(bPtr + 4); | |
702 | 65532 | x3Val = _mm_loadu_ps(bPtr + 4); | |
703 | 32766 | b0Val = _mm_unpacklo_ps(x0Val, x1Val); | |
704 | 32766 | b1Val = _mm_unpackhi_ps(x0Val, x1Val); | |
705 | 32766 | b2Val = _mm_unpacklo_ps(x2Val, x3Val); | |
706 | 32766 | b3Val = _mm_unpackhi_ps(x2Val, x3Val); | |
707 | |||
708 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
709 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
710 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
711 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
712 | |||
713 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
714 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
715 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
716 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
717 | |||
718 | 32766 | aPtr += 16; | |
719 | 32766 | bPtr += 8; | |
720 | } | ||
721 | |||
722 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
723 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
724 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
725 | |||
726 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
727 | |||
728 | _mm_store_ps(dotProductVector, | ||
729 | dotProdVal0); // Store the results back into the dot product vector | ||
730 | |||
731 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
732 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
733 | |||
734 | 2 | number = eighthPoints * 8; | |
735 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
736 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
737 | 14 | aPtr += 2; | |
738 | 14 | bPtr += 1; | |
739 | } | ||
740 | |||
741 | 2 | *result = returnValue; | |
742 | 2 | } | |
743 | |||
744 | #endif /*LV_HAVE_SSE*/ | ||
745 | |||
746 | |||
747 | #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/ | ||
748 |