Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_dot_prod_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This block computes the dot product (or inner product) of the two | |
16 | * vectors \p input and \p taps. Given \p num_points samples in each | |
17 | * vector, the result is the sum of the element-wise complex products. | |
18 | * The result is a single complex float value stored at the \p result | |
19 | * address. | |
20 | * | ||
21 | * <b>Dispatcher Prototype</b> | ||
22 | * \code | ||
23 | * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const | ||
24 | * lv_32fc_t* taps, unsigned int num_points) \endcode | ||
25 | * | ||
26 | * \b Inputs | ||
27 | * \li input: vector of complex floats. | ||
28 | * \li taps: complex float taps. | ||
29 | * \li num_points: number of samples in both \p input and \p taps. | ||
30 | * | ||
31 | * \b Outputs | ||
32 | * \li result: pointer to a complex float value to hold the dot product result. | ||
33 | * | ||
34 | * \b Example | ||
35 | * \code | ||
36 | * int N = 10000; | |
37 | * unsigned int alignment = volk_get_alignment(); | |
38 | * lv_32fc_t* input = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment); | |
39 | * lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment); | |
40 | * lv_32fc_t result; // ... fill input and taps, then: | |
41 | * volk_32fc_x2_dot_prod_32fc(&result, input, taps, N); | |
42 | * \endcode | ||
43 | */ | ||
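
For reference, every kernel in this file computes the same quantity: the complex multiply-accumulate of `input` and `taps` over `num_points` samples. A minimal scalar sketch of that behaviour (the helper name `dot_prod_32fc_reference` is illustrative only, not part of VOLK's API):

```c
#include <volk/volk_complex.h>

/* Illustrative scalar reference for the kernels below: accumulate input[i] * taps[i].
 * lv_32fc_t is VOLK's complex float type, so the complex operators apply directly. */
static inline void dot_prod_32fc_reference(lv_32fc_t* result,
                                           const lv_32fc_t* input,
                                           const lv_32fc_t* taps,
                                           unsigned int num_points)
{
    lv_32fc_t acc = lv_cmake(0.0f, 0.0f);
    for (unsigned int n = 0; n < num_points; n++) {
        acc += input[n] * taps[n];
    }
    *result = acc;
}
```
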
44 | |||
45 | #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H | ||
46 | #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H | ||
47 | |||
48 | #include <stdio.h> | ||
49 | #include <string.h> | ||
50 | #include <volk/volk_common.h> | ||
51 | #include <volk/volk_complex.h> | ||
52 | |||
53 | |||
54 | #ifdef LV_HAVE_RISCV64 | ||
55 | extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result, | ||
56 | const lv_32fc_t* input, | ||
57 | const lv_32fc_t* taps, | ||
58 | unsigned int num_points); | ||
59 | #endif | ||
60 | |||
61 | #ifdef LV_HAVE_GENERIC | ||
62 | |||
63 | |||
64 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, | |
65 | const lv_32fc_t* input, | ||
66 | const lv_32fc_t* taps, | ||
67 | unsigned int num_points) | ||
68 | { | ||
69 | |||
70 | 2 | float* res = (float*)result; | |
71 | 2 | float* in = (float*)input; | |
72 | 2 | float* tp = (float*)taps; | |
73 | 2 | unsigned int n_2_ccomplex_blocks = num_points / 2; | |
74 | |||
75 | 2 | float sum0[2] = { 0, 0 }; | |
76 | 2 | float sum1[2] = { 0, 0 }; | |
77 | 2 | unsigned int i = 0; | |
78 | |||
79 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
80 | 131070 | sum0[0] += in[0] * tp[0] - in[1] * tp[1]; | |
81 | 131070 | sum0[1] += in[0] * tp[1] + in[1] * tp[0]; | |
82 | 131070 | sum1[0] += in[2] * tp[2] - in[3] * tp[3]; | |
83 | 131070 | sum1[1] += in[2] * tp[3] + in[3] * tp[2]; | |
84 | |||
85 | 131070 | in += 4; | |
86 | 131070 | tp += 4; | |
87 | } | ||
88 | |||
89 | 2 | res[0] = sum0[0] + sum1[0]; | |
90 | 2 | res[1] = sum0[1] + sum1[1]; | |
91 | |||
92 | // Cleanup if we had an odd number of points | ||
93 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1) { |
94 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
95 | } | ||
96 | 2 | } | |
97 | |||
98 | #endif /*LV_HAVE_GENERIC*/ | ||
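
The four accumulation lines in the generic kernel are the complex product (a+bi)(c+di) = (ac-bd) + (ad+bc)i written out in real arithmetic, applied to two samples per iteration with independent partial sums. The identity for a single sample, as a hedged sketch (the helper is hypothetical and exists only to make the expansion explicit):

```c
/* One complex multiply-accumulate expanded into real arithmetic, matching the
 * sum0[]/sum1[] updates in volk_32fc_x2_dot_prod_32fc_generic above. */
static inline void complex_mac_expanded(float acc[2], const float in[2], const float tp[2])
{
    acc[0] += in[0] * tp[0] - in[1] * tp[1]; /* real part: ac - bd */
    acc[1] += in[0] * tp[1] + in[1] * tp[0]; /* imag part: ad + bc */
}
```
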
99 | |||
100 | |||
101 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
102 | |||
103 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, | |
104 | const lv_32fc_t* input, | ||
105 | const lv_32fc_t* taps, | ||
106 | unsigned int num_points) | ||
107 | { | ||
108 | |||
109 | 2 | const unsigned int num_bytes = num_points * 8; | |
110 | 2 | unsigned int isodd = num_points & 1; | |
111 | |||
112 | 2 | __VOLK_ASM( | |
113 | "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" | ||
114 | "# const float *taps, unsigned num_bytes)\n\t" | ||
115 | "# float sum0 = 0;\n\t" | ||
116 | "# float sum1 = 0;\n\t" | ||
117 | "# float sum2 = 0;\n\t" | ||
118 | "# float sum3 = 0;\n\t" | ||
119 | "# do {\n\t" | ||
120 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
121 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
122 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
123 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
124 | "# input += 4;\n\t" | ||
125 | "# taps += 4; \n\t" | ||
126 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
127 | "# result[0] = sum0 + sum2;\n\t" | ||
128 | "# result[1] = sum1 + sum3;\n\t" | ||
129 | "# TODO: prefetch and better scheduling\n\t" | ||
130 | " xor %%r9, %%r9\n\t" | ||
131 | " xor %%r10, %%r10\n\t" | ||
132 | " movq %%rcx, %%rax\n\t" | ||
133 | " movq %%rcx, %%r8\n\t" | ||
134 | " movq %[rsi], %%r9\n\t" | ||
135 | " movq %[rdx], %%r10\n\t" | ||
136 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
137 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
138 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
139 | " shr $4, %%r8\n\t" | ||
140 | " jmp .%=L1_test\n\t" | ||
141 | " # 4 taps / loop\n\t" | ||
142 | " # something like ?? cycles / loop\n\t" | ||
143 | ".%=Loop1: \n\t" | ||
144 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
145 | "# movups (%%r9), %%xmmA\n\t" | ||
146 | "# movups (%%r10), %%xmmB\n\t" | ||
147 | "# movups %%xmmA, %%xmmZ\n\t" | ||
148 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
149 | "# mulps %%xmmB, %%xmmA\n\t" | ||
150 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
151 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
152 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
153 | "# movups %%xmmA, %%xmmZ\n\t" | ||
154 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
155 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
156 | "# movups %%xmmZ, %%xmmY\n\t" | ||
157 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
158 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
159 | "# addps %%xmmZ, %%xmmA\n\t" | ||
160 | "# addps %%xmmA, %%xmmC\n\t" | ||
161 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
162 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
163 | " movups 0(%%r9), %%xmm0\n\t" | ||
164 | " movups 16(%%r9), %%xmm1\n\t" | ||
165 | " movups %%xmm0, %%xmm4\n\t" | ||
166 | " movups 0(%%r10), %%xmm2\n\t" | ||
167 | " mulps %%xmm2, %%xmm0\n\t" | ||
168 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
169 | " movups 16(%%r10), %%xmm3\n\t" | ||
170 | " movups %%xmm1, %%xmm5\n\t" | ||
171 | " addps %%xmm0, %%xmm6\n\t" | ||
172 | " mulps %%xmm3, %%xmm1\n\t" | ||
173 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
174 | " addps %%xmm1, %%xmm6\n\t" | ||
175 | " mulps %%xmm4, %%xmm2\n\t" | ||
176 | " addps %%xmm2, %%xmm7\n\t" | ||
177 | " mulps %%xmm5, %%xmm3\n\t" | ||
178 | " add $32, %%r9\n\t" | ||
179 | " addps %%xmm3, %%xmm7\n\t" | ||
180 | " add $32, %%r10\n\t" | ||
181 | ".%=L1_test:\n\t" | ||
182 | " dec %%rax\n\t" | ||
183 | " jge .%=Loop1\n\t" | ||
184 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
185 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | |
186 | " # If so, we've got 2 more taps to do.\n\t" | ||
187 | " and $1, %%r8\n\t" | ||
188 | " je .%=Leven\n\t" | ||
189 | " # The count was odd, do 2 more taps.\n\t" | ||
190 | " movups 0(%%r9), %%xmm0\n\t" | ||
191 | " movups %%xmm0, %%xmm4\n\t" | ||
192 | " movups 0(%%r10), %%xmm2\n\t" | ||
193 | " mulps %%xmm2, %%xmm0\n\t" | ||
194 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
195 | " addps %%xmm0, %%xmm6\n\t" | ||
196 | " mulps %%xmm4, %%xmm2\n\t" | ||
197 | " addps %%xmm2, %%xmm7\n\t" | ||
198 | ".%=Leven:\n\t" | ||
199 | " # neg inversor\n\t" | ||
200 | " xorps %%xmm1, %%xmm1\n\t" | ||
201 | " mov $0x80000000, %%r9\n\t" | ||
202 | " movd %%r9, %%xmm1\n\t" | ||
203 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
204 | " # pfpnacc\n\t" | ||
205 | " xorps %%xmm1, %%xmm6\n\t" | ||
206 | " movups %%xmm6, %%xmm2\n\t" | ||
207 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
208 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
209 | " movups %%xmm2, %%xmm3\n\t" | ||
210 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
211 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
212 | " addps %%xmm2, %%xmm6\n\t" | ||
213 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
214 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
215 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
216 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
217 | "to memory\n\t" | ||
218 | : | ||
219 | : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) | ||
220 | : "rax", "r8", "r9", "r10"); | ||
221 | |||
222 | |||
223 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
224 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
225 | } | ||
226 | |||
227 | 2 | return; | |
228 | } | ||
229 | |||
230 | #endif /* LV_HAVE_SSE && LV_HAVE_64 */ | ||
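
In the assembly above, the "# neg inversor" block builds a (+0.0, -0.0, +0.0, -0.0) mask and XORs it into the accumulator, flipping the sign of the odd lanes as part of the SSE stand-in for `pfpnacc` noted in the comments. The same step expressed with intrinsics, purely as an illustrative sketch (this is not code the kernel actually runs):

```c
#include <xmmintrin.h>

/* XOR with (+0, -0, +0, -0) flips the sign bit of lanes 1 and 3 only, which is
 * how the asm prepares the "ac - bd" real terms before the pairwise adds. */
static inline __m128 negate_odd_lanes(__m128 v)
{
    const __m128 sign_mask = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); /* lanes 3..0 */
    return _mm_xor_ps(v, sign_mask);
}
```
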
231 | |||
232 | |||
233 | #ifdef LV_HAVE_SSE3 | ||
234 | |||
235 | #include <pmmintrin.h> | ||
236 | |||
237 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, | |
238 | const lv_32fc_t* input, | ||
239 | const lv_32fc_t* taps, | ||
240 | unsigned int num_points) | ||
241 | { | ||
242 | |||
243 | lv_32fc_t dotProduct; | ||
244 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
245 | |||
246 | 2 | unsigned int number = 0; | |
247 | 2 | const unsigned int halfPoints = num_points / 2; | |
248 | 2 | unsigned int isodd = num_points & 1; | |
249 | |||
250 | __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
251 | |||
252 | 2 | const lv_32fc_t* a = input; | |
253 | 2 | const lv_32fc_t* b = taps; | |
254 | |||
255 | 2 | dotProdVal = _mm_setzero_ps(); | |
256 | |||
257 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
258 | |||
259 | 131070 | x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
260 | 131070 | y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
261 | |||
262 | 131070 | yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
263 | 131070 | yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | |
264 | |||
265 | 131070 | tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | |
266 | |||
267 | 131070 | x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
268 | |||
269 | 131070 | tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
270 | |||
271 | 131070 | z = _mm_addsub_ps(tmp1, | |
272 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
273 | |||
274 | dotProdVal = | ||
275 | 131070 | _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together | |
276 | |||
277 | 131070 | a += 2; | |
278 | 131070 | b += 2; | |
279 | } | ||
280 | |||
281 | __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; | ||
282 | |||
283 | _mm_storeu_ps((float*)dotProductVector, | ||
284 | dotProdVal); // Store the results back into the dot product vector | ||
285 | |||
286 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1]); | |
287 | |||
288 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
289 | 2 | dotProduct += input[num_points - 1] * taps[num_points - 1]; | |
290 | } | ||
291 | |||
292 | 2 | *result = dotProduct; | |
293 | 2 | } | |
294 | |||
295 | #endif /*LV_HAVE_SSE3*/ | ||
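
The load/moveldup/movehdup/shuffle/addsub sequence above is the usual SSE3 idiom for multiplying packed complex floats; the rest of the kernel is accumulation plus the final horizontal reduction. Isolated as a sketch (the helper `cmul2_sse3` is hypothetical, not a VOLK kernel):

```c
#include <pmmintrin.h> /* SSE3 */
#include <volk/volk_complex.h>

/* Multiply two pairs of complex floats with the same SSE3 idiom the kernel uses:
 * duplicate real parts, duplicate imaginary parts, then addsub to combine. */
static inline void cmul2_sse3(lv_32fc_t out[2], const lv_32fc_t a[2], const lv_32fc_t b[2])
{
    __m128 x = _mm_loadu_ps((const float*)a); /* ar, ai, br, bi */
    __m128 y = _mm_loadu_ps((const float*)b); /* cr, ci, dr, di */
    __m128 yl = _mm_moveldup_ps(y);           /* cr, cr, dr, dr */
    __m128 yh = _mm_movehdup_ps(y);           /* ci, ci, di, di */
    __m128 t1 = _mm_mul_ps(x, yl);            /* ar*cr, ai*cr, br*dr, bi*dr */
    __m128 xs = _mm_shuffle_ps(x, x, 0xB1);   /* ai, ar, bi, br */
    __m128 t2 = _mm_mul_ps(xs, yh);           /* ai*ci, ar*ci, bi*di, br*di */
    __m128 z = _mm_addsub_ps(t1, t2);         /* ar*cr-ai*ci, ai*cr+ar*ci, ... */
    _mm_storeu_ps((float*)out, z);
}
```
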
296 | |||
297 | // #ifdef LV_HAVE_SSE4_1 | ||
298 | |||
299 | // #include <smmintrin.h> | ||
300 | |||
301 | // static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, | ||
302 | // const lv_32fc_t* input, | ||
303 | // const lv_32fc_t* taps, | ||
304 | // unsigned int num_points) | ||
305 | // { | ||
306 | |||
307 | // unsigned int i = 0; | ||
308 | // const unsigned int qtr_points = num_points / 4; | ||
309 | // const unsigned int isodd = num_points & 3; | ||
310 | |||
311 | // __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; | ||
312 | // float *p_input, *p_taps; | ||
313 | // __m64* p_result; | ||
314 | |||
315 | // p_result = (__m64*)result; | ||
316 | // p_input = (float*)input; | ||
317 | // p_taps = (float*)taps; | ||
318 | |||
319 | // static const __m128i neg = { 0x000000000000000080000000 }; | ||
320 | |||
321 | // real0 = _mm_setzero_ps(); | ||
322 | // real1 = _mm_setzero_ps(); | ||
323 | // im0 = _mm_setzero_ps(); | ||
324 | // im1 = _mm_setzero_ps(); | ||
325 | |||
326 | // for (; i < qtr_points; ++i) { | ||
327 | // xmm0 = _mm_loadu_ps(p_input); | ||
328 | // xmm1 = _mm_loadu_ps(p_taps); | ||
329 | |||
330 | // p_input += 4; | ||
331 | // p_taps += 4; | ||
332 | |||
333 | // xmm2 = _mm_loadu_ps(p_input); | ||
334 | // xmm3 = _mm_loadu_ps(p_taps); | ||
335 | |||
336 | // p_input += 4; | ||
337 | // p_taps += 4; | ||
338 | |||
339 | // xmm4 = _mm_unpackhi_ps(xmm0, xmm2); | ||
340 | // xmm5 = _mm_unpackhi_ps(xmm1, xmm3); | ||
341 | // xmm0 = _mm_unpacklo_ps(xmm0, xmm2); | ||
342 | // xmm2 = _mm_unpacklo_ps(xmm1, xmm3); | ||
343 | |||
344 | // // imaginary vector from input | ||
345 | // xmm1 = _mm_unpackhi_ps(xmm0, xmm4); | ||
346 | // // real vector from input | ||
347 | // xmm3 = _mm_unpacklo_ps(xmm0, xmm4); | ||
348 | // // imaginary vector from taps | ||
349 | // xmm0 = _mm_unpackhi_ps(xmm2, xmm5); | ||
350 | // // real vector from taps | ||
351 | // xmm2 = _mm_unpacklo_ps(xmm2, xmm5); | ||
352 | |||
353 | // xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); | ||
354 | // xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); | ||
355 | |||
356 | // xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); | ||
357 | // xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); | ||
358 | |||
359 | // real0 = _mm_add_ps(xmm4, real0); | ||
360 | // real1 = _mm_add_ps(xmm5, real1); | ||
361 | // im0 = _mm_add_ps(xmm6, im0); | ||
362 | // im1 = _mm_add_ps(xmm7, im1); | ||
363 | // } | ||
364 | |||
365 | // real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); | ||
366 | |||
367 | // im0 = _mm_add_ps(im0, im1); | ||
368 | // real0 = _mm_add_ps(real0, real1); | ||
369 | |||
370 | // im0 = _mm_add_ps(im0, real0); | ||
371 | |||
372 | // _mm_storel_pi(p_result, im0); | ||
373 | |||
374 | // for (i = num_points - isodd; i < num_points; i++) { | ||
375 | // *result += input[i] * taps[i]; | ||
376 | // } | ||
377 | // } | ||
378 | |||
379 | // #endif /*LV_HAVE_SSE4_1*/ | ||
380 | |||
381 | #ifdef LV_HAVE_AVX | ||
382 | |||
383 | #include <immintrin.h> | ||
384 | |||
385 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
386 | const lv_32fc_t* input, | ||
387 | const lv_32fc_t* taps, | ||
388 | unsigned int num_points) | ||
389 | { | ||
390 | |||
391 | 2 | unsigned int isodd = num_points & 3; | |
392 | 2 | unsigned int i = 0; | |
393 | lv_32fc_t dotProduct; | ||
394 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
395 | |||
396 | 2 | unsigned int number = 0; | |
397 | 2 | const unsigned int quarterPoints = num_points / 4; | |
398 | |||
399 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
400 | |||
401 | 2 | const lv_32fc_t* a = input; | |
402 | 2 | const lv_32fc_t* b = taps; | |
403 | |||
404 | 2 | dotProdVal = _mm256_setzero_ps(); | |
405 | |||
406 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
407 | 65534 | x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
408 | 65534 | y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
409 | |||
410 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
411 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
412 | |||
413 | 65534 | tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... | |
414 | |||
415 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
416 | |||
417 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
418 | |||
419 | 65534 | z = _mm256_addsub_ps(tmp1, | |
420 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
421 | |||
422 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
423 | z); // Add the complex multiplication results together | ||
424 | |||
425 | 65534 | a += 4; | |
426 | 65534 | b += 4; | |
427 | } | ||
428 | |||
429 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
430 | |||
431 | _mm256_storeu_ps((float*)dotProductVector, | ||
432 | dotProdVal); // Store the results back into the dot product vector | ||
433 | |||
434 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
435 | 2 | dotProductVector[3]); | |
436 | |||
437 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
438 | 6 | dotProduct += input[i] * taps[i]; | |
439 | } | ||
440 | |||
441 | 2 | *result = dotProduct; | |
442 | 2 | } | |
443 | |||
444 | #endif /*LV_HAVE_AVX*/ | ||
445 | |||
446 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
447 | #include <immintrin.h> | ||
448 | |||
449 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, | |
450 | const lv_32fc_t* input, | ||
451 | const lv_32fc_t* taps, | ||
452 | unsigned int num_points) | ||
453 | { | ||
454 | |||
455 | 2 | unsigned int isodd = num_points & 3; | |
456 | 2 | unsigned int i = 0; | |
457 | lv_32fc_t dotProduct; | ||
458 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
459 | |||
460 | 2 | unsigned int number = 0; | |
461 | 2 | const unsigned int quarterPoints = num_points / 4; | |
462 | |||
463 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
464 | |||
465 | 2 | const lv_32fc_t* a = input; | |
466 | 2 | const lv_32fc_t* b = taps; | |
467 | |||
468 | 2 | dotProdVal = _mm256_setzero_ps(); | |
469 | |||
470 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
471 | |||
472 | 65534 | x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
473 | 65534 | y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
474 | |||
475 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
476 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
477 | |||
478 | 65534 | tmp1 = x; | |
479 | |||
480 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
481 | |||
482 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
483 | |||
484 | 65534 | z = _mm256_fmaddsub_ps( | |
485 | tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
486 | |||
487 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
488 | z); // Add the complex multiplication results together | ||
489 | |||
490 | 65534 | a += 4; | |
491 | 65534 | b += 4; | |
492 | } | ||
493 | |||
494 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
495 | |||
496 | _mm256_storeu_ps((float*)dotProductVector, | ||
497 | dotProdVal); // Store the results back into the dot product vector | ||
498 | |||
499 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
500 | 2 | dotProductVector[3]); | |
501 | |||
502 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
503 | 6 | dotProduct += input[i] * taps[i]; | |
504 | } | ||
505 | |||
506 | 2 | *result = dotProduct; | |
507 | 2 | } | |
508 | |||
509 | #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ | ||
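
The only difference from the plain AVX kernel is that `_mm256_fmaddsub_ps(tmp1, yl, tmp2)` fuses the first multiply with the add/subtract, computing `tmp1*yl - tmp2` in even lanes and `tmp1*yl + tmp2` in odd lanes. A small stand-alone check of that lane behaviour (illustrative values only; compile with AVX and FMA enabled):

```c
#include <immintrin.h>
#include <stdio.h>

/* Even lanes: a*b - c, odd lanes: a*b + c. With a=2, b=3, c=1 this prints 5 and 7. */
int main(void)
{
    __m256 a = _mm256_set1_ps(2.0f);
    __m256 b = _mm256_set1_ps(3.0f);
    __m256 c = _mm256_set1_ps(1.0f);
    float r[8];
    _mm256_storeu_ps(r, _mm256_fmaddsub_ps(a, b, c));
    printf("lane0 = %f, lane1 = %f\n", r[0], r[1]);
    return 0;
}
```
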
510 | |||
511 | #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ | ||
512 | |||
513 | #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H | ||
514 | #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H | ||
515 | |||
516 | #include <stdio.h> | ||
517 | #include <string.h> | ||
518 | #include <volk/volk_common.h> | ||
519 | #include <volk/volk_complex.h> | ||
520 | |||
521 | |||
522 | #ifdef LV_HAVE_GENERIC | ||
523 | |||
524 | |||
525 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, | |
526 | const lv_32fc_t* input, | ||
527 | const lv_32fc_t* taps, | ||
528 | unsigned int num_points) | ||
529 | { | ||
530 | |||
531 | 2 | const unsigned int num_bytes = num_points * 8; | |
532 | |||
533 | 2 | float* res = (float*)result; | |
534 | 2 | float* in = (float*)input; | |
535 | 2 | float* tp = (float*)taps; | |
536 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
537 | |||
538 | 2 | float sum0[2] = { 0, 0 }; | |
539 | 2 | float sum1[2] = { 0, 0 }; | |
540 | 2 | unsigned int i = 0; | |
541 | |||
542 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
543 | 131070 | sum0[0] += in[0] * tp[0] - in[1] * tp[1]; | |
544 | 131070 | sum0[1] += in[0] * tp[1] + in[1] * tp[0]; | |
545 | 131070 | sum1[0] += in[2] * tp[2] - in[3] * tp[3]; | |
546 | 131070 | sum1[1] += in[2] * tp[3] + in[3] * tp[2]; | |
547 | |||
548 | 131070 | in += 4; | |
549 | 131070 | tp += 4; | |
550 | } | ||
551 | |||
552 | 2 | res[0] = sum0[0] + sum1[0]; | |
553 | 2 | res[1] = sum0[1] + sum1[1]; | |
554 | |||
555 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1) { |
556 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
557 | } | ||
558 | 2 | } | |
559 | |||
560 | #endif /*LV_HAVE_GENERIC*/ | ||
561 | |||
562 | |||
563 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
564 | |||
565 | |||
566 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, | |
567 | const lv_32fc_t* input, | ||
568 | const lv_32fc_t* taps, | ||
569 | unsigned int num_points) | ||
570 | { | ||
571 | |||
572 | 2 | const unsigned int num_bytes = num_points * 8; | |
573 | 2 | unsigned int isodd = num_points & 1; | |
574 | |||
575 | 2 | __VOLK_ASM( | |
576 | "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" | ||
577 | "# const float *taps, unsigned num_bytes)\n\t" | ||
578 | "# float sum0 = 0;\n\t" | ||
579 | "# float sum1 = 0;\n\t" | ||
580 | "# float sum2 = 0;\n\t" | ||
581 | "# float sum3 = 0;\n\t" | ||
582 | "# do {\n\t" | ||
583 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
584 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
585 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
586 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
587 | "# input += 4;\n\t" | ||
588 | "# taps += 4; \n\t" | ||
589 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
590 | "# result[0] = sum0 + sum2;\n\t" | ||
591 | "# result[1] = sum1 + sum3;\n\t" | ||
592 | "# TODO: prefetch and better scheduling\n\t" | ||
593 | " xor %%r9, %%r9\n\t" | ||
594 | " xor %%r10, %%r10\n\t" | ||
595 | " movq %%rcx, %%rax\n\t" | ||
596 | " movq %%rcx, %%r8\n\t" | ||
597 | " movq %[rsi], %%r9\n\t" | ||
598 | " movq %[rdx], %%r10\n\t" | ||
599 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
600 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
601 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
602 | " shr $4, %%r8\n\t" | ||
603 | " jmp .%=L1_test\n\t" | ||
604 | " # 4 taps / loop\n\t" | ||
605 | " # something like ?? cycles / loop\n\t" | ||
606 | ".%=Loop1: \n\t" | ||
607 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
608 | "# movaps (%%r9), %%xmmA\n\t" | ||
609 | "# movaps (%%r10), %%xmmB\n\t" | ||
610 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
611 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
612 | "# mulps %%xmmB, %%xmmA\n\t" | ||
613 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
614 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
615 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
616 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
617 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
618 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
619 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
620 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
621 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
622 | "# addps %%xmmZ, %%xmmA\n\t" | ||
623 | "# addps %%xmmA, %%xmmC\n\t" | ||
624 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
625 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
626 | " movaps 0(%%r9), %%xmm0\n\t" | ||
627 | " movaps 16(%%r9), %%xmm1\n\t" | ||
628 | " movaps %%xmm0, %%xmm4\n\t" | ||
629 | " movaps 0(%%r10), %%xmm2\n\t" | ||
630 | " mulps %%xmm2, %%xmm0\n\t" | ||
631 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
632 | " movaps 16(%%r10), %%xmm3\n\t" | ||
633 | " movaps %%xmm1, %%xmm5\n\t" | ||
634 | " addps %%xmm0, %%xmm6\n\t" | ||
635 | " mulps %%xmm3, %%xmm1\n\t" | ||
636 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
637 | " addps %%xmm1, %%xmm6\n\t" | ||
638 | " mulps %%xmm4, %%xmm2\n\t" | ||
639 | " addps %%xmm2, %%xmm7\n\t" | ||
640 | " mulps %%xmm5, %%xmm3\n\t" | ||
641 | " add $32, %%r9\n\t" | ||
642 | " addps %%xmm3, %%xmm7\n\t" | ||
643 | " add $32, %%r10\n\t" | ||
644 | ".%=L1_test:\n\t" | ||
645 | " dec %%rax\n\t" | ||
646 | " jge .%=Loop1\n\t" | ||
647 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
648 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | |
649 | " # If so, we've got 2 more taps to do.\n\t" | ||
650 | " and $1, %%r8\n\t" | ||
651 | " je .%=Leven\n\t" | ||
652 | " # The count was odd, do 2 more taps.\n\t" | ||
653 | " movaps 0(%%r9), %%xmm0\n\t" | ||
654 | " movaps %%xmm0, %%xmm4\n\t" | ||
655 | " movaps 0(%%r10), %%xmm2\n\t" | ||
656 | " mulps %%xmm2, %%xmm0\n\t" | ||
657 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
658 | " addps %%xmm0, %%xmm6\n\t" | ||
659 | " mulps %%xmm4, %%xmm2\n\t" | ||
660 | " addps %%xmm2, %%xmm7\n\t" | ||
661 | ".%=Leven:\n\t" | ||
662 | " # neg inversor\n\t" | ||
663 | " xorps %%xmm1, %%xmm1\n\t" | ||
664 | " mov $0x80000000, %%r9\n\t" | ||
665 | " movd %%r9, %%xmm1\n\t" | ||
666 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
667 | " # pfpnacc\n\t" | ||
668 | " xorps %%xmm1, %%xmm6\n\t" | ||
669 | " movaps %%xmm6, %%xmm2\n\t" | ||
670 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
671 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
672 | " movaps %%xmm2, %%xmm3\n\t" | ||
673 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
674 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
675 | " addps %%xmm2, %%xmm6\n\t" | ||
676 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
677 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
678 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
679 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
680 | "to memory\n\t" | ||
681 | : | ||
682 | : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) | ||
683 | : "rax", "r8", "r9", "r10"); | ||
684 | |||
685 | |||
686 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
687 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
688 | } | ||
689 | |||
690 | 2 | return; | |
691 | } | ||
692 | |||
693 | #endif | ||
694 | |||
695 | #if LV_HAVE_SSE && LV_HAVE_32 | ||
696 | |||
697 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, | ||
698 | const lv_32fc_t* input, | ||
699 | const lv_32fc_t* taps, | ||
700 | unsigned int num_points) | ||
701 | { | ||
702 | |||
703 | volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); | ||
704 | |||
705 | #if 0 | ||
706 | const unsigned int num_bytes = num_points*8; | ||
707 | unsigned int isodd = num_points & 1; | ||
708 | |||
709 | __VOLK_ASM __VOLK_VOLATILE | ||
710 | ( | ||
711 | " #pushl %%ebp\n\t" | ||
712 | " #movl %%esp, %%ebp\n\t" | ||
713 | " movl 12(%%ebp), %%eax # input\n\t" | ||
714 | " movl 16(%%ebp), %%edx # taps\n\t" | ||
715 | " movl 20(%%ebp), %%ecx # n_bytes\n\t" | ||
716 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
717 | " movaps 0(%%eax), %%xmm0\n\t" | ||
718 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
719 | " movaps 0(%%edx), %%xmm2\n\t" | ||
720 | " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" | ||
721 | " jmp .%=L1_test\n\t" | ||
722 | " # 4 taps / loop\n\t" | ||
723 | " # something like ?? cycles / loop\n\t" | ||
724 | ".%=Loop1: \n\t" | ||
725 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
726 | "# movaps (%%eax), %%xmmA\n\t" | ||
727 | "# movaps (%%edx), %%xmmB\n\t" | ||
728 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
729 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
730 | "# mulps %%xmmB, %%xmmA\n\t" | ||
731 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
732 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
733 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
734 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
735 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
736 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
737 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
738 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
739 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
740 | "# addps %%xmmZ, %%xmmA\n\t" | ||
741 | "# addps %%xmmA, %%xmmC\n\t" | ||
742 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
743 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
744 | " movaps 16(%%eax), %%xmm1\n\t" | ||
745 | " movaps %%xmm0, %%xmm4\n\t" | ||
746 | " mulps %%xmm2, %%xmm0\n\t" | ||
747 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
748 | " movaps 16(%%edx), %%xmm3\n\t" | ||
749 | " movaps %%xmm1, %%xmm5\n\t" | ||
750 | " addps %%xmm0, %%xmm6\n\t" | ||
751 | " mulps %%xmm3, %%xmm1\n\t" | ||
752 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
753 | " addps %%xmm1, %%xmm6\n\t" | ||
754 | " mulps %%xmm4, %%xmm2\n\t" | ||
755 | " movaps 32(%%eax), %%xmm0\n\t" | ||
756 | " addps %%xmm2, %%xmm7\n\t" | ||
757 | " mulps %%xmm5, %%xmm3\n\t" | ||
758 | " addl $32, %%eax\n\t" | ||
759 | " movaps 32(%%edx), %%xmm2\n\t" | ||
760 | " addps %%xmm3, %%xmm7\n\t" | ||
761 | " addl $32, %%edx\n\t" | ||
762 | ".%=L1_test:\n\t" | ||
763 | " decl %%ecx\n\t" | ||
764 | " jge .%=Loop1\n\t" | ||
765 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
766 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | |
767 | " # If so, we've got 2 more taps to do.\n\t" | ||
768 | " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" | ||
769 | " shrl $4, %%ecx\n\t" | ||
770 | " andl $1, %%ecx\n\t" | ||
771 | " je .%=Leven\n\t" | ||
772 | " # The count was odd, do 2 more taps.\n\t" | ||
773 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
774 | " # from the main loop.\n\t" | ||
775 | " movaps %%xmm0, %%xmm4\n\t" | ||
776 | " mulps %%xmm2, %%xmm0\n\t" | ||
777 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
778 | " addps %%xmm0, %%xmm6\n\t" | ||
779 | " mulps %%xmm4, %%xmm2\n\t" | ||
780 | " addps %%xmm2, %%xmm7\n\t" | ||
781 | ".%=Leven:\n\t" | ||
782 | " # neg inversor\n\t" | ||
783 | " movl 8(%%ebp), %%eax \n\t" | ||
784 | " xorps %%xmm1, %%xmm1\n\t" | ||
785 | " movl $0x80000000, (%%eax)\n\t" | ||
786 | " movss (%%eax), %%xmm1\n\t" | ||
787 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
788 | " # pfpnacc\n\t" | ||
789 | " xorps %%xmm1, %%xmm6\n\t" | ||
790 | " movaps %%xmm6, %%xmm2\n\t" | ||
791 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
792 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
793 | " movaps %%xmm2, %%xmm3\n\t" | ||
794 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
795 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
796 | " addps %%xmm2, %%xmm6\n\t" | ||
797 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
798 | " #movl 8(%%ebp), %%eax # @result\n\t" | ||
799 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
800 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
801 | " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" | ||
802 | " #popl %%ebp\n\t" | ||
803 | : | ||
804 | : | ||
805 | : "eax", "ecx", "edx" | ||
806 | ); | ||
807 | |||
808 | |||
809 | int getem = num_bytes % 16; | ||
810 | |||
811 | if(isodd) { | ||
812 | *result += (input[num_points - 1] * taps[num_points - 1]); | ||
813 | } | ||
814 | |||
815 | return; | ||
816 | #endif | ||
817 | } | ||
818 | |||
819 | #endif /*LV_HAVE_SSE*/ | ||
820 | |||
821 | #ifdef LV_HAVE_SSE3 | ||
822 | |||
823 | #include <pmmintrin.h> | ||
824 | |||
825 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, | |
826 | const lv_32fc_t* input, | ||
827 | const lv_32fc_t* taps, | ||
828 | unsigned int num_points) | ||
829 | { | ||
830 | |||
831 | 2 | const unsigned int num_bytes = num_points * 8; | |
832 | 2 | unsigned int isodd = num_points & 1; | |
833 | |||
834 | lv_32fc_t dotProduct; | ||
835 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
836 | |||
837 | 2 | unsigned int number = 0; | |
838 | 2 | const unsigned int halfPoints = num_bytes >> 4; | |
839 | |||
840 | __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
841 | |||
842 | 2 | const lv_32fc_t* a = input; | |
843 | 2 | const lv_32fc_t* b = taps; | |
844 | |||
845 | 2 | dotProdVal = _mm_setzero_ps(); | |
846 | |||
847 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
848 | |||
849 | 131070 | x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
850 | 131070 | y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
851 | |||
852 | 131070 | yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
853 | 131070 | yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | |
854 | |||
855 | 131070 | tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | |
856 | |||
857 | 131070 | x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
858 | |||
859 | 131070 | tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
860 | |||
861 | 131070 | z = _mm_addsub_ps(tmp1, | |
862 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
863 | |||
864 | dotProdVal = | ||
865 | 131070 | _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together | |
866 | |||
867 | 131070 | a += 2; | |
868 | 131070 | b += 2; | |
869 | } | ||
870 | |||
871 | __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; | ||
872 | |||
873 | _mm_store_ps((float*)dotProductVector, | ||
874 | dotProdVal); // Store the results back into the dot product vector | ||
875 | |||
876 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1]); | |
877 | |||
878 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
879 | 2 | dotProduct += input[num_points - 1] * taps[num_points - 1]; | |
880 | } | ||
881 | |||
882 | 2 | *result = dotProduct; | |
883 | 2 | } | |
884 | |||
885 | #endif /*LV_HAVE_SSE3*/ | ||
886 | |||
887 | |||
888 | // #ifdef LV_HAVE_SSE4_1 | ||
889 | |||
890 | // #include <smmintrin.h> | ||
891 | |||
892 | // static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, | ||
893 | // const lv_32fc_t* input, | ||
894 | // const lv_32fc_t* taps, | ||
895 | // unsigned int num_points) | ||
896 | // { | ||
897 | |||
898 | // unsigned int i = 0; | ||
899 | // const unsigned int qtr_points = num_points / 4; | ||
900 | // const unsigned int isodd = num_points & 3; | ||
901 | |||
902 | // __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; | ||
903 | // float *p_input, *p_taps; | ||
904 | // __m64* p_result; | ||
905 | |||
906 | // static const __m128i neg = { 0x000000000000000080000000 }; | ||
907 | |||
908 | // p_result = (__m64*)result; | ||
909 | // p_input = (float*)input; | ||
910 | // p_taps = (float*)taps; | ||
911 | |||
912 | // real0 = _mm_setzero_ps(); | ||
913 | // real1 = _mm_setzero_ps(); | ||
914 | // im0 = _mm_setzero_ps(); | ||
915 | // im1 = _mm_setzero_ps(); | ||
916 | |||
917 | // for (; i < qtr_points; ++i) { | ||
918 | // xmm0 = _mm_load_ps(p_input); | ||
919 | // xmm1 = _mm_load_ps(p_taps); | ||
920 | |||
921 | // p_input += 4; | ||
922 | // p_taps += 4; | ||
923 | |||
924 | // xmm2 = _mm_load_ps(p_input); | ||
925 | // xmm3 = _mm_load_ps(p_taps); | ||
926 | |||
927 | // p_input += 4; | ||
928 | // p_taps += 4; | ||
929 | |||
930 | // xmm4 = _mm_unpackhi_ps(xmm0, xmm2); | ||
931 | // xmm5 = _mm_unpackhi_ps(xmm1, xmm3); | ||
932 | // xmm0 = _mm_unpacklo_ps(xmm0, xmm2); | ||
933 | // xmm2 = _mm_unpacklo_ps(xmm1, xmm3); | ||
934 | |||
935 | // // imaginary vector from input | ||
936 | // xmm1 = _mm_unpackhi_ps(xmm0, xmm4); | ||
937 | // // real vector from input | ||
938 | // xmm3 = _mm_unpacklo_ps(xmm0, xmm4); | ||
939 | // // imaginary vector from taps | ||
940 | // xmm0 = _mm_unpackhi_ps(xmm2, xmm5); | ||
941 | // // real vector from taps | ||
942 | // xmm2 = _mm_unpacklo_ps(xmm2, xmm5); | ||
943 | |||
944 | // xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); | ||
945 | // xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); | ||
946 | |||
947 | // xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); | ||
948 | // xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); | ||
949 | |||
950 | // real0 = _mm_add_ps(xmm4, real0); | ||
951 | // real1 = _mm_add_ps(xmm5, real1); | ||
952 | // im0 = _mm_add_ps(xmm6, im0); | ||
953 | // im1 = _mm_add_ps(xmm7, im1); | ||
954 | // } | ||
955 | |||
956 | // real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); | ||
957 | |||
958 | // im0 = _mm_add_ps(im0, im1); | ||
959 | // real0 = _mm_add_ps(real0, real1); | ||
960 | |||
961 | // im0 = _mm_add_ps(im0, real0); | ||
962 | |||
963 | // _mm_storel_pi(p_result, im0); | ||
964 | |||
965 | // for (i = num_points - isodd; i < num_points; i++) { | ||
966 | // *result += input[i] * taps[i]; | ||
967 | // } | ||
968 | // } | ||
969 | |||
970 | // #endif /*LV_HAVE_SSE4_1*/ | ||
971 | |||
972 | #ifdef LV_HAVE_NEON | ||
973 | #include <arm_neon.h> | ||
974 | |||
975 | static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, | ||
976 | const lv_32fc_t* input, | ||
977 | const lv_32fc_t* taps, | ||
978 | unsigned int num_points) | ||
979 | { | ||
980 | |||
981 | unsigned int quarter_points = num_points / 4; | ||
982 | unsigned int number; | ||
983 | |||
984 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
985 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
986 | // for 2-lane vectors, 1st lane holds the real part, | ||
987 | // 2nd lane holds the imaginary part | ||
988 | float32x4x2_t a_val, b_val, c_val, accumulator; | ||
989 | float32x4x2_t tmp_real, tmp_imag; | ||
990 | accumulator.val[0] = vdupq_n_f32(0); | ||
991 | accumulator.val[1] = vdupq_n_f32(0); | ||
992 | |||
993 | for (number = 0; number < quarter_points; ++number) { | ||
994 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
995 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
996 | __VOLK_PREFETCH(a_ptr + 8); | ||
997 | __VOLK_PREFETCH(b_ptr + 8); | ||
998 | |||
999 | // multiply the real*real and imag*imag to get real result | ||
1000 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
1001 | tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
1002 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
1003 | tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); | ||
1004 | |||
1005 | // Multiply cross terms to get the imaginary result | ||
1006 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
1007 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); | ||
1008 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
1009 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
1010 | |||
1011 | c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); | ||
1012 | c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); | ||
1013 | |||
1014 | accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]); | ||
1015 | accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]); | ||
1016 | |||
1017 | a_ptr += 4; | ||
1018 | b_ptr += 4; | ||
1019 | } | ||
1020 | lv_32fc_t accum_result[4]; | ||
1021 | vst2q_f32((float*)accum_result, accumulator); | ||
1022 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
1023 | |||
1024 | // tail case | ||
1025 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
1026 | *result += (*a_ptr++) * (*b_ptr++); | ||
1027 | } | ||
1028 | } | ||
1029 | #endif /*LV_HAVE_NEON*/ | ||
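
`vld2q_f32` de-interleaves four complex samples into a vector of real parts and a vector of imaginary parts, so the complex product reduces to four plain lane-wise multiplies plus an add and a subtract. That core step in isolation, as a hedged sketch (the helper `cmul4_neon` is hypothetical and assumes a NEON-capable target):

```c
#include <arm_neon.h>
#include <volk/volk_complex.h>

/* Sketch: multiply four complex floats at once using the same de-interleaved
 * layout as the NEON kernels (val[0] = real lanes, val[1] = imaginary lanes). */
static inline void cmul4_neon(lv_32fc_t out[4], const lv_32fc_t a[4], const lv_32fc_t b[4])
{
    float32x4x2_t va = vld2q_f32((const float*)a); /* a0r..a3r || a0i..a3i */
    float32x4x2_t vb = vld2q_f32((const float*)b); /* b0r..b3r || b0i..b3i */
    float32x4x2_t vc;
    vc.val[0] = vsubq_f32(vmulq_f32(va.val[0], vb.val[0]),  /* ar*br - ai*bi */
                          vmulq_f32(va.val[1], vb.val[1]));
    vc.val[1] = vaddq_f32(vmulq_f32(va.val[0], vb.val[1]),  /* ar*bi + ai*br */
                          vmulq_f32(va.val[1], vb.val[0]));
    vst2q_f32((float*)out, vc); /* re-interleave back to r,i,r,i,... */
}
```
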
1030 | |||
1031 | #ifdef LV_HAVE_NEON | ||
1032 | #include <arm_neon.h> | ||
1033 | static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, | ||
1034 | const lv_32fc_t* input, | ||
1035 | const lv_32fc_t* taps, | ||
1036 | unsigned int num_points) | ||
1037 | { | ||
1038 | |||
1039 | unsigned int quarter_points = num_points / 4; | ||
1040 | unsigned int number; | ||
1041 | |||
1042 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
1043 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
1044 | // for 2-lane vectors, 1st lane holds the real part, | ||
1045 | // 2nd lane holds the imaginary part | ||
1046 | float32x4x2_t a_val, b_val, accumulator; | ||
1047 | float32x4x2_t tmp_imag; | ||
1048 | accumulator.val[0] = vdupq_n_f32(0); | ||
1049 | accumulator.val[1] = vdupq_n_f32(0); | ||
1050 | |||
1051 | for (number = 0; number < quarter_points; ++number) { | ||
1052 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
1053 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
1054 | __VOLK_PREFETCH(a_ptr + 8); | ||
1055 | __VOLK_PREFETCH(b_ptr + 8); | ||
1056 | |||
1057 | // do the first multiply | ||
1058 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
1059 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
1060 | |||
1061 | // use multiply accumulate/subtract to get result | ||
1062 | tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); | ||
1063 | tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); | ||
1064 | |||
1065 | accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); | ||
1066 | accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); | ||
1067 | |||
1068 | // increment pointers | ||
1069 | a_ptr += 4; | ||
1070 | b_ptr += 4; | ||
1071 | } | ||
1072 | lv_32fc_t accum_result[4]; | ||
1073 | vst2q_f32((float*)accum_result, accumulator); | ||
1074 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
1075 | |||
1076 | // tail case | ||
1077 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
1078 | *result += (*a_ptr++) * (*b_ptr++); | ||
1079 | } | ||
1080 | } | ||
1081 | #endif /*LV_HAVE_NEON*/ | ||
1082 | |||
1083 | #ifdef LV_HAVE_NEON | ||
1084 | static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, | ||
1085 | const lv_32fc_t* input, | ||
1086 | const lv_32fc_t* taps, | ||
1087 | unsigned int num_points) | ||
1088 | { | ||
1089 | |||
1090 | unsigned int quarter_points = num_points / 4; | ||
1091 | unsigned int number; | ||
1092 | |||
1093 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
1094 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
1095 | // for 2-lane vectors, 1st lane holds the real part, | ||
1096 | // 2nd lane holds the imaginary part | ||
1097 | float32x4x2_t a_val, b_val, accumulator1, accumulator2; | ||
1098 | accumulator1.val[0] = vdupq_n_f32(0); | ||
1099 | accumulator1.val[1] = vdupq_n_f32(0); | ||
1100 | accumulator2.val[0] = vdupq_n_f32(0); | ||
1101 | accumulator2.val[1] = vdupq_n_f32(0); | ||
1102 | |||
1103 | for (number = 0; number < quarter_points; ++number) { | ||
1104 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
1105 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
1106 | __VOLK_PREFETCH(a_ptr + 8); | ||
1107 | __VOLK_PREFETCH(b_ptr + 8); | ||
1108 | |||
1109 | // use 2 accumulators to remove inter-instruction data dependencies | ||
1110 | accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); | ||
1111 | accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); | ||
1112 | accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); | ||
1113 | accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); | ||
1114 | // increment pointers | ||
1115 | a_ptr += 4; | ||
1116 | b_ptr += 4; | ||
1117 | } | ||
1118 | accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); | ||
1119 | accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); | ||
1120 | lv_32fc_t accum_result[4]; | ||
1121 | vst2q_f32((float*)accum_result, accumulator1); | ||
1122 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
1123 | |||
1124 | // tail case | ||
1125 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
1126 | *result += (*a_ptr++) * (*b_ptr++); | ||
1127 | } | ||
1128 | } | ||
1129 | #endif /*LV_HAVE_NEON*/ | ||
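
This variant uses the multiply-accumulate intrinsics `vmlaq_f32`/`vmlsq_f32` (lane-wise `acc + x*y` and `acc - x*y`) and keeps two accumulator sets so consecutive iterations do not serialize on a single register. Their semantics as a minimal, illustrative sketch:

```c
#include <arm_neon.h>

/* Lane-wise multiply-accumulate forms used above (illustrative only):
 * vmlaq_f32(acc, x, y) returns acc + x*y, vmlsq_f32(acc, x, y) returns acc - x*y. */
static inline float32x4_t mla_mls_demo(float32x4_t acc, float32x4_t x, float32x4_t y)
{
    acc = vmlaq_f32(acc, x, y); /* acc += x * y */
    acc = vmlsq_f32(acc, x, y); /* acc -= x * y */
    return acc;
}
```
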
1130 | |||
1131 | #ifdef LV_HAVE_NEON | ||
1132 | static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, | ||
1133 | const lv_32fc_t* input, | ||
1134 | const lv_32fc_t* taps, | ||
1135 | unsigned int num_points) | ||
1136 | { | ||
1137 | // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very | ||
1138 | // fast | ||
1139 | |||
1140 | unsigned int quarter_points = num_points / 8; | ||
1141 | unsigned int number; | ||
1142 | |||
1143 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
1144 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
1145 | // for 2-lane vectors, 1st lane holds the real part, | ||
1146 | // 2nd lane holds the imaginary part | ||
1147 | float32x4x4_t a_val, b_val, accumulator1, accumulator2; | ||
1148 | float32x4x2_t reduced_accumulator; | ||
1149 | accumulator1.val[0] = vdupq_n_f32(0); | ||
1150 | accumulator1.val[1] = vdupq_n_f32(0); | ||
1151 | accumulator1.val[2] = vdupq_n_f32(0); | ||
1152 | accumulator1.val[3] = vdupq_n_f32(0); | ||
1153 | accumulator2.val[0] = vdupq_n_f32(0); | ||
1154 | accumulator2.val[1] = vdupq_n_f32(0); | ||
1155 | accumulator2.val[2] = vdupq_n_f32(0); | ||
1156 | accumulator2.val[3] = vdupq_n_f32(0); | ||
1157 | |||
1158 | // 8 input regs, 8 accumulators -> 16/16 neon regs are used | ||
1159 | for (number = 0; number < quarter_points; ++number) { | ||
1160 | a_val = vld4q_f32((float*)a_ptr); // vld4q de-interleaves: val[0]/val[1] = real/imag of even samples, val[2]/val[3] = real/imag of odd samples | |
1161 | b_val = vld4q_f32((float*)b_ptr); // same de-interleaved layout for the b samples | |
1162 | __VOLK_PREFETCH(a_ptr + 8); | ||
1163 | __VOLK_PREFETCH(b_ptr + 8); | ||
1164 | |||
1165 | // use 2 accumulators to remove inter-instruction data dependencies | ||
1166 | accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); | ||
1167 | accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); | ||
1168 | |||
1169 | accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]); | ||
1170 | accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]); | ||
1171 | |||
1172 | accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); | ||
1173 | accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); | ||
1174 | |||
1175 | accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]); | ||
1176 | accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]); | ||
1177 | // increment pointers | ||
1178 | a_ptr += 8; | ||
1179 | b_ptr += 8; | ||
1180 | } | ||
1181 | // reduce 8 accumulator lanes down to 2 (1 real and 1 imag) | ||
1182 | accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]); | ||
1183 | accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]); | ||
1184 | accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]); | ||
1185 | accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]); | ||
1186 | reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); | ||
1187 | reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); | ||
1188 | // now reduce accumulators to scalars | ||
1189 | lv_32fc_t accum_result[4]; | ||
1190 | vst2q_f32((float*)accum_result, reduced_accumulator); | ||
1191 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
1192 | |||
1193 | // tail case | ||
1194 | for (number = quarter_points * 8; number < num_points; ++number) { | ||
1195 | *result += (*a_ptr++) * (*b_ptr++); | ||
1196 | } | ||
1197 | } | ||
1198 | #endif /*LV_HAVE_NEON*/ | ||
1199 | |||
1200 | |||
1201 | #ifdef LV_HAVE_AVX | ||
1202 | |||
1203 | #include <immintrin.h> | ||
1204 | |||
1205 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
1206 | const lv_32fc_t* input, | ||
1207 | const lv_32fc_t* taps, | ||
1208 | unsigned int num_points) | ||
1209 | { | ||
1210 | |||
1211 | 2 | unsigned int isodd = num_points & 3; | |
1212 | 2 | unsigned int i = 0; | |
1213 | lv_32fc_t dotProduct; | ||
1214 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
1215 | |||
1216 | 2 | unsigned int number = 0; | |
1217 | 2 | const unsigned int quarterPoints = num_points / 4; | |
1218 | |||
1219 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
1220 | |||
1221 | 2 | const lv_32fc_t* a = input; | |
1222 | 2 | const lv_32fc_t* b = taps; | |
1223 | |||
1224 | 2 | dotProdVal = _mm256_setzero_ps(); | |
1225 | |||
1226 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
1227 | |||
1228 | 65534 | x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
1229 | 65534 | y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
1230 | |||
1231 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
1232 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
1233 | |||
1234 | 65534 | tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... | |
1235 | |||
1236 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
1237 | |||
1238 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
1239 | |||
1240 | 65534 | z = _mm256_addsub_ps(tmp1, | |
1241 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
1242 | |||
1243 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
1244 | z); // Add the complex multiplication results together | ||
1245 | |||
1246 | 65534 | a += 4; | |
1247 | 65534 | b += 4; | |
1248 | } | ||
1249 | |||
1250 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
1251 | |||
1252 | _mm256_store_ps((float*)dotProductVector, | ||
1253 | dotProdVal); // Store the results back into the dot product vector | ||
1254 | |||
1255 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
1256 | 2 | dotProductVector[3]); | |
1257 | |||
1258 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
1259 | 6 | dotProduct += input[i] * taps[i]; | |
1260 | } | ||
1261 | |||
1262 | 2 | *result = dotProduct; | |
1263 | 2 | } | |
1264 | |||
1265 | #endif /*LV_HAVE_AVX*/ | ||
1266 | |||
1267 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
1268 | #include <immintrin.h> | ||
1269 | |||
1270 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, | |
1271 | const lv_32fc_t* input, | ||
1272 | const lv_32fc_t* taps, | ||
1273 | unsigned int num_points) | ||
1274 | { | ||
1275 | |||
1276 | 2 | unsigned int isodd = num_points & 3; | |
1277 | 2 | unsigned int i = 0; | |
1278 | lv_32fc_t dotProduct; | ||
1279 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
1280 | |||
1281 | 2 | unsigned int number = 0; | |
1282 | 2 | const unsigned int quarterPoints = num_points / 4; | |
1283 | |||
1284 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
1285 | |||
1286 | 2 | const lv_32fc_t* a = input; | |
1287 | 2 | const lv_32fc_t* b = taps; | |
1288 | |||
1289 | 2 | dotProdVal = _mm256_setzero_ps(); | |
1290 | |||
1291 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
1292 | |||
1293 | 65534 | x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
1294 | 65534 | y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
1295 | |||
1296 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
1297 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
1298 | |||
1299 | 65534 | tmp1 = x; | |
1300 | |||
1301 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
1302 | |||
1303 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
1304 | |||
1305 | 65534 | z = _mm256_fmaddsub_ps( | |
1306 | tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
1307 | |||
1308 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
1309 | z); // Add the complex multiplication results together | ||
1310 | |||
1311 | 65534 | a += 4; | |
1312 | 65534 | b += 4; | |
1313 | } | ||
1314 | |||
1315 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
1316 | |||
1317 | _mm256_store_ps((float*)dotProductVector, | ||
1318 | dotProdVal); // Store the results back into the dot product vector | ||
1319 | |||
1320 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
1321 | 2 | dotProductVector[3]); | |
1322 | |||
1323 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
1324 | 6 | dotProduct += input[i] * taps[i]; | |
1325 | } | ||
1326 | |||
1327 | 2 | *result = dotProduct; | |
1328 | 2 | } | |
1329 | |||
1330 | #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ | ||
1331 | |||
1332 | |||
1333 | #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ | ||
1334 |