| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32fc_x2_dot_prod_32fc | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * This kernel computes the dot product (or inner product) of two complex | ||
| 16 | * vectors, \p input and \p taps. Given \p num_points elements in each | ||
| 17 | * vector, it accumulates the element-wise products (without conjugating | ||
| 18 | * either operand). The result is a single complex float written to the | ||
| 19 | * \p result address. | ||
| 20 | * | ||
| 21 | * <b>Dispatcher Prototype</b> | ||
| 22 | * \code | ||
| 23 | * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const | ||
| 24 | * lv_32fc_t* taps, unsigned int num_points) \endcode | ||
| 25 | * | ||
| 26 | * \b Inputs | ||
| 27 | * \li input: vector of complex floats. | ||
| 28 | * \li taps: complex float taps. | ||
| 29 | * \li num_points: number of samples in both \p input and \p taps. | ||
| 30 | * | ||
| 31 | * \b Outputs | ||
| 32 | * \li result: pointer to a complex float value to hold the dot product result. | ||
| 33 | * | ||
| 34 | * \b Example | ||
| 35 | * \code | ||
| 36 | * int N = 10000; | ||
| 37 | * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment()); | ||
| 38 | * lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment()); | ||
| 39 | * lv_32fc_t result; // ... fill in and taps, then compute the dot product: | ||
| 40 | * volk_32fc_x2_dot_prod_32fc(&result, in, taps, N); | ||
| 41 | * volk_free(in); volk_free(taps); | ||
| 42 | * \endcode | ||
| 43 | */ | ||
| 44 | |||
| 45 | #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H | ||
| 46 | #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H | ||
| 47 | |||
| 48 | #include <stdio.h> | ||
| 49 | #include <string.h> | ||
| 50 | #include <volk/volk_common.h> | ||
| 51 | #include <volk/volk_complex.h> | ||
| 52 | |||
| 53 | |||
| 54 | #ifdef LV_HAVE_RISCV64 | ||
| 55 | extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result, | ||
| 56 | const lv_32fc_t* input, | ||
| 57 | const lv_32fc_t* taps, | ||
| 58 | unsigned int num_points); | ||
| 59 | #endif | ||
| 60 | |||
| 61 | #ifdef LV_HAVE_GENERIC | ||
| 62 | |||
| 63 | |||
| 64 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, | |
| 65 | const lv_32fc_t* input, | ||
| 66 | const lv_32fc_t* taps, | ||
| 67 | unsigned int num_points) | ||
| 68 | { | ||
| 69 | |||
| 70 | 2 | float* res = (float*)result; | |
| 71 | 2 | float* in = (float*)input; | |
| 72 | 2 | float* tp = (float*)taps; | |
| 73 | 2 | unsigned int n_2_ccomplex_blocks = num_points / 2; | |
| 74 | |||
| 75 | 2 | float sum0[2] = { 0, 0 }; | |
| 76 | 2 | float sum1[2] = { 0, 0 }; | |
| 77 | 2 | unsigned int i = 0; | |
| 78 | |||
| 79 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
| 80 | 131070 | sum0[0] += in[0] * tp[0] - in[1] * tp[1]; | |
| 81 | 131070 | sum0[1] += in[0] * tp[1] + in[1] * tp[0]; | |
| 82 | 131070 | sum1[0] += in[2] * tp[2] - in[3] * tp[3]; | |
| 83 | 131070 | sum1[1] += in[2] * tp[3] + in[3] * tp[2]; | |
| 84 | |||
| 85 | 131070 | in += 4; | |
| 86 | 131070 | tp += 4; | |
| 87 | } | ||
| 88 | |||
| 89 | 2 | res[0] = sum0[0] + sum1[0]; | |
| 90 | 2 | res[1] = sum0[1] + sum1[1]; | |
| 91 | |||
| 92 | // Cleanup if we had an odd number of points | ||
| 93 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1) { |
| 94 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
| 95 | } | ||
| 96 | 2 | } | |
| 97 | |||
| 98 | #endif /*LV_HAVE_GENERIC*/ | ||
| 99 | |||
| 100 | |||
| 101 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
| 102 | |||
| 103 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, | |
| 104 | const lv_32fc_t* input, | ||
| 105 | const lv_32fc_t* taps, | ||
| 106 | unsigned int num_points) | ||
| 107 | { | ||
| 108 | |||
| 109 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 110 | 2 | unsigned int isodd = num_points & 1; | |
| 111 | |||
| 112 | 2 | __VOLK_ASM( | |
| 113 | "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" | ||
| 114 | "# const float *taps, unsigned num_bytes)\n\t" | ||
| 115 | "# float sum0 = 0;\n\t" | ||
| 116 | "# float sum1 = 0;\n\t" | ||
| 117 | "# float sum2 = 0;\n\t" | ||
| 118 | "# float sum3 = 0;\n\t" | ||
| 119 | "# do {\n\t" | ||
| 120 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
| 121 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
| 122 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
| 123 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
| 124 | "# input += 4;\n\t" | ||
| 125 | "# taps += 4; \n\t" | ||
| 126 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
| 127 | "# result[0] = sum0 + sum2;\n\t" | ||
| 128 | "# result[1] = sum1 + sum3;\n\t" | ||
| 129 | "# TODO: prefetch and better scheduling\n\t" | ||
| 130 | " xor %%r9, %%r9\n\t" | ||
| 131 | " xor %%r10, %%r10\n\t" | ||
| 132 | " movq %%rcx, %%rax\n\t" | ||
| 133 | " movq %%rcx, %%r8\n\t" | ||
| 134 | " movq %[rsi], %%r9\n\t" | ||
| 135 | " movq %[rdx], %%r10\n\t" | ||
| 136 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
| 137 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
| 138 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
| 139 | " shr $4, %%r8\n\t" | ||
| 140 | " jmp .%=L1_test\n\t" | ||
| 141 | " # 4 taps / loop\n\t" | ||
| 142 | " # something like ?? cycles / loop\n\t" | ||
| 143 | ".%=Loop1: \n\t" | ||
| 144 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
| 145 | "# movups (%%r9), %%xmmA\n\t" | ||
| 146 | "# movups (%%r10), %%xmmB\n\t" | ||
| 147 | "# movups %%xmmA, %%xmmZ\n\t" | ||
| 148 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
| 149 | "# mulps %%xmmB, %%xmmA\n\t" | ||
| 150 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
| 151 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
| 152 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
| 153 | "# movups %%xmmA, %%xmmZ\n\t" | ||
| 154 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
| 155 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
| 156 | "# movups %%xmmZ, %%xmmY\n\t" | ||
| 157 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
| 158 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
| 159 | "# addps %%xmmZ, %%xmmA\n\t" | ||
| 160 | "# addps %%xmmA, %%xmmC\n\t" | ||
| 161 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
| 162 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
| 163 | " movups 0(%%r9), %%xmm0\n\t" | ||
| 164 | " movups 16(%%r9), %%xmm1\n\t" | ||
| 165 | " movups %%xmm0, %%xmm4\n\t" | ||
| 166 | " movups 0(%%r10), %%xmm2\n\t" | ||
| 167 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 168 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 169 | " movups 16(%%r10), %%xmm3\n\t" | ||
| 170 | " movups %%xmm1, %%xmm5\n\t" | ||
| 171 | " addps %%xmm0, %%xmm6\n\t" | ||
| 172 | " mulps %%xmm3, %%xmm1\n\t" | ||
| 173 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
| 174 | " addps %%xmm1, %%xmm6\n\t" | ||
| 175 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 176 | " addps %%xmm2, %%xmm7\n\t" | ||
| 177 | " mulps %%xmm5, %%xmm3\n\t" | ||
| 178 | " add $32, %%r9\n\t" | ||
| 179 | " addps %%xmm3, %%xmm7\n\t" | ||
| 180 | " add $32, %%r10\n\t" | ||
| 181 | ".%=L1_test:\n\t" | ||
| 182 | " dec %%rax\n\t" | ||
| 183 | " jge .%=Loop1\n\t" | ||
| 184 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
| 185 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | ||
| 186 | " # If so, we've got 2 more taps to do.\n\t" | ||
| 187 | " and $1, %%r8\n\t" | ||
| 188 | " je .%=Leven\n\t" | ||
| 189 | " # The count was odd, do 2 more taps.\n\t" | ||
| 190 | " movups 0(%%r9), %%xmm0\n\t" | ||
| 191 | " movups %%xmm0, %%xmm4\n\t" | ||
| 192 | " movups 0(%%r10), %%xmm2\n\t" | ||
| 193 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 194 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 195 | " addps %%xmm0, %%xmm6\n\t" | ||
| 196 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 197 | " addps %%xmm2, %%xmm7\n\t" | ||
| 198 | ".%=Leven:\n\t" | ||
| 199 | " # neg inversor\n\t" | ||
| 200 | " xorps %%xmm1, %%xmm1\n\t" | ||
| 201 | " mov $0x80000000, %%r9\n\t" | ||
| 202 | " movd %%r9, %%xmm1\n\t" | ||
| 203 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
| 204 | " # pfpnacc\n\t" | ||
| 205 | " xorps %%xmm1, %%xmm6\n\t" | ||
| 206 | " movups %%xmm6, %%xmm2\n\t" | ||
| 207 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
| 208 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
| 209 | " movups %%xmm2, %%xmm3\n\t" | ||
| 210 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
| 211 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
| 212 | " addps %%xmm2, %%xmm6\n\t" | ||
| 213 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
| 214 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
| 215 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
| 216 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
| 217 | "to memory\n\t" | ||
| 218 | : | ||
| 219 | : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) | ||
| 220 | : "rax", "r8", "r9", "r10"); | ||
| 221 | |||
| 222 | |||
| 223 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
| 224 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
| 225 | } | ||
| 226 | |||
| 227 | 2 | return; | |
| 228 | } | ||
| 229 | |||
| 230 | #endif /* LV_HAVE_SSE && LV_HAVE_64 */ | ||
| 231 | |||
| 232 | |||
| 233 | #ifdef LV_HAVE_SSE3 | ||
| 234 | |||
| 235 | #include <pmmintrin.h> | ||
| 236 | |||
| 237 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, | |
| 238 | const lv_32fc_t* input, | ||
| 239 | const lv_32fc_t* taps, | ||
| 240 | unsigned int num_points) | ||
| 241 | { | ||
| 242 | |||
| 243 | lv_32fc_t dotProduct; | ||
| 244 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 245 | |||
| 246 | 2 | unsigned int number = 0; | |
| 247 | 2 | const unsigned int halfPoints = num_points / 2; | |
| 248 | 2 | unsigned int isodd = num_points & 1; | |
| 249 | |||
| 250 | __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 251 | |||
| 252 | 2 | const lv_32fc_t* a = input; | |
| 253 | 2 | const lv_32fc_t* b = taps; | |
| 254 | |||
| 255 | 2 | dotProdVal = _mm_setzero_ps(); | |
| 256 | |||
| 257 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
| 258 | |||
| 259 | 131070 | x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
| 260 | 131070 | y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
| 261 | |||
| 262 | 131070 | yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
| 263 | 131070 | yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | |
| 264 | |||
| 265 | 131070 | tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | |
| 266 | |||
| 267 | 131070 | x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
| 268 | |||
| 269 | 131070 | tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
| 270 | |||
| 271 | 131070 | z = _mm_addsub_ps(tmp1, | |
| 272 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 273 | |||
| 274 | dotProdVal = | ||
| 275 | 131070 | _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together | |
| 276 | |||
| 277 | 131070 | a += 2; | |
| 278 | 131070 | b += 2; | |
| 279 | } | ||
| 280 | |||
| 281 | __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; | ||
| 282 | |||
| 283 | _mm_storeu_ps((float*)dotProductVector, | ||
| 284 | dotProdVal); // Store the results back into the dot product vector | ||
| 285 | |||
| 286 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1]); | |
| 287 | |||
| 288 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
| 289 | 2 | dotProduct += input[num_points - 1] * taps[num_points - 1]; | |
| 290 | } | ||
| 291 | |||
| 292 | 2 | *result = dotProduct; | |
| 293 | 2 | } | |
| 294 | |||
| 295 | #endif /*LV_HAVE_SSE3*/ | ||
| 296 | |||
| 297 | // #ifdef LV_HAVE_SSE4_1 | ||
| 298 | |||
| 299 | // #include <smmintrin.h> | ||
| 300 | |||
| 301 | // static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, | ||
| 302 | // const lv_32fc_t* input, | ||
| 303 | // const lv_32fc_t* taps, | ||
| 304 | // unsigned int num_points) | ||
| 305 | // { | ||
| 306 | |||
| 307 | // unsigned int i = 0; | ||
| 308 | // const unsigned int qtr_points = num_points / 4; | ||
| 309 | // const unsigned int isodd = num_points & 3; | ||
| 310 | |||
| 311 | // __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; | ||
| 312 | // float *p_input, *p_taps; | ||
| 313 | // __m64* p_result; | ||
| 314 | |||
| 315 | // p_result = (__m64*)result; | ||
| 316 | // p_input = (float*)input; | ||
| 317 | // p_taps = (float*)taps; | ||
| 318 | |||
| 319 | // static const __m128i neg = { 0x000000000000000080000000 }; | ||
| 320 | |||
| 321 | // real0 = _mm_setzero_ps(); | ||
| 322 | // real1 = _mm_setzero_ps(); | ||
| 323 | // im0 = _mm_setzero_ps(); | ||
| 324 | // im1 = _mm_setzero_ps(); | ||
| 325 | |||
| 326 | // for (; i < qtr_points; ++i) { | ||
| 327 | // xmm0 = _mm_loadu_ps(p_input); | ||
| 328 | // xmm1 = _mm_loadu_ps(p_taps); | ||
| 329 | |||
| 330 | // p_input += 4; | ||
| 331 | // p_taps += 4; | ||
| 332 | |||
| 333 | // xmm2 = _mm_loadu_ps(p_input); | ||
| 334 | // xmm3 = _mm_loadu_ps(p_taps); | ||
| 335 | |||
| 336 | // p_input += 4; | ||
| 337 | // p_taps += 4; | ||
| 338 | |||
| 339 | // xmm4 = _mm_unpackhi_ps(xmm0, xmm2); | ||
| 340 | // xmm5 = _mm_unpackhi_ps(xmm1, xmm3); | ||
| 341 | // xmm0 = _mm_unpacklo_ps(xmm0, xmm2); | ||
| 342 | // xmm2 = _mm_unpacklo_ps(xmm1, xmm3); | ||
| 343 | |||
| 344 | // // imaginary vector from input | ||
| 345 | // xmm1 = _mm_unpackhi_ps(xmm0, xmm4); | ||
| 346 | // // real vector from input | ||
| 347 | // xmm3 = _mm_unpacklo_ps(xmm0, xmm4); | ||
| 348 | // // imaginary vector from taps | ||
| 349 | // xmm0 = _mm_unpackhi_ps(xmm2, xmm5); | ||
| 350 | // // real vector from taps | ||
| 351 | // xmm2 = _mm_unpacklo_ps(xmm2, xmm5); | ||
| 352 | |||
| 353 | // xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); | ||
| 354 | // xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); | ||
| 355 | |||
| 356 | // xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); | ||
| 357 | // xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); | ||
| 358 | |||
| 359 | // real0 = _mm_add_ps(xmm4, real0); | ||
| 360 | // real1 = _mm_add_ps(xmm5, real1); | ||
| 361 | // im0 = _mm_add_ps(xmm6, im0); | ||
| 362 | // im1 = _mm_add_ps(xmm7, im1); | ||
| 363 | // } | ||
| 364 | |||
| 365 | // real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); | ||
| 366 | |||
| 367 | // im0 = _mm_add_ps(im0, im1); | ||
| 368 | // real0 = _mm_add_ps(real0, real1); | ||
| 369 | |||
| 370 | // im0 = _mm_add_ps(im0, real0); | ||
| 371 | |||
| 372 | // _mm_storel_pi(p_result, im0); | ||
| 373 | |||
| 374 | // for (i = num_points - isodd; i < num_points; i++) { | ||
| 375 | // *result += input[i] * taps[i]; | ||
| 376 | // } | ||
| 377 | // } | ||
| 378 | |||
| 379 | // #endif /*LV_HAVE_SSE4_1*/ | ||
| 380 | |||
| 381 | #ifdef LV_HAVE_AVX | ||
| 382 | |||
| 383 | #include <immintrin.h> | ||
| 384 | |||
| 385 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
| 386 | const lv_32fc_t* input, | ||
| 387 | const lv_32fc_t* taps, | ||
| 388 | unsigned int num_points) | ||
| 389 | { | ||
| 390 | |||
| 391 | 2 | unsigned int isodd = num_points & 3; | |
| 392 | 2 | unsigned int i = 0; | |
| 393 | lv_32fc_t dotProduct; | ||
| 394 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 395 | |||
| 396 | 2 | unsigned int number = 0; | |
| 397 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 398 | |||
| 399 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 400 | |||
| 401 | 2 | const lv_32fc_t* a = input; | |
| 402 | 2 | const lv_32fc_t* b = taps; | |
| 403 | |||
| 404 | 2 | dotProdVal = _mm256_setzero_ps(); | |
| 405 | |||
| 406 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 407 | 65534 | x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
| 408 | 65534 | y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
| 409 | |||
| 410 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
| 411 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
| 412 | |||
| 413 | 65534 | tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... | |
| 414 | |||
| 415 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
| 416 | |||
| 417 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
| 418 | |||
| 419 | 65534 | z = _mm256_addsub_ps(tmp1, | |
| 420 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 421 | |||
| 422 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
| 423 | z); // Add the complex multiplication results together | ||
| 424 | |||
| 425 | 65534 | a += 4; | |
| 426 | 65534 | b += 4; | |
| 427 | } | ||
| 428 | |||
| 429 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
| 430 | |||
| 431 | _mm256_storeu_ps((float*)dotProductVector, | ||
| 432 | dotProdVal); // Store the results back into the dot product vector | ||
| 433 | |||
| 434 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
| 435 | 2 | dotProductVector[3]); | |
| 436 | |||
| 437 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
| 438 | 6 | dotProduct += input[i] * taps[i]; | |
| 439 | } | ||
| 440 | |||
| 441 | 2 | *result = dotProduct; | |
| 442 | 2 | } | |
| 443 | |||
| 444 | #endif /*LV_HAVE_AVX*/ | ||
| 445 | |||
| 446 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
| 447 | #include <immintrin.h> | ||
| 448 | |||
| 449 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, | |
| 450 | const lv_32fc_t* input, | ||
| 451 | const lv_32fc_t* taps, | ||
| 452 | unsigned int num_points) | ||
| 453 | { | ||
| 454 | |||
| 455 | 2 | unsigned int isodd = num_points & 3; | |
| 456 | 2 | unsigned int i = 0; | |
| 457 | lv_32fc_t dotProduct; | ||
| 458 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 459 | |||
| 460 | 2 | unsigned int number = 0; | |
| 461 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 462 | |||
| 463 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 464 | |||
| 465 | 2 | const lv_32fc_t* a = input; | |
| 466 | 2 | const lv_32fc_t* b = taps; | |
| 467 | |||
| 468 | 2 | dotProdVal = _mm256_setzero_ps(); | |
| 469 | |||
| 470 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 471 | |||
| 472 | 65534 | x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
| 473 | 65534 | y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
| 474 | |||
| 475 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
| 476 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
| 477 | |||
| 478 | 65534 | tmp1 = x; | |
| 479 | |||
| 480 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
| 481 | |||
| 482 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
| 483 | |||
| 484 | 65534 | z = _mm256_fmaddsub_ps( | |
| 485 | tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 486 | |||
| 487 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
| 488 | z); // Add the complex multiplication results together | ||
| 489 | |||
| 490 | 65534 | a += 4; | |
| 491 | 65534 | b += 4; | |
| 492 | } | ||
| 493 | |||
| 494 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
| 495 | |||
| 496 | _mm256_storeu_ps((float*)dotProductVector, | ||
| 497 | dotProdVal); // Store the results back into the dot product vector | ||
| 498 | |||
| 499 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
| 500 | 2 | dotProductVector[3]); | |
| 501 | |||
| 502 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
| 503 | 6 | dotProduct += input[i] * taps[i]; | |
| 504 | } | ||
| 505 | |||
| 506 | 2 | *result = dotProduct; | |
| 507 | 2 | } | |
| 508 | |||
| 509 | #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ | ||
| 510 | |||
| 511 | #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ | ||
| 512 | |||
| 513 | #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H | ||
| 514 | #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H | ||
| 515 | |||
| 516 | #include <stdio.h> | ||
| 517 | #include <string.h> | ||
| 518 | #include <volk/volk_common.h> | ||
| 519 | #include <volk/volk_complex.h> | ||
| 520 | |||
| 521 | |||
| 522 | #ifdef LV_HAVE_GENERIC | ||
| 523 | |||
| 524 | |||
| 525 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, | |
| 526 | const lv_32fc_t* input, | ||
| 527 | const lv_32fc_t* taps, | ||
| 528 | unsigned int num_points) | ||
| 529 | { | ||
| 530 | |||
| 531 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 532 | |||
| 533 | 2 | float* res = (float*)result; | |
| 534 | 2 | float* in = (float*)input; | |
| 535 | 2 | float* tp = (float*)taps; | |
| 536 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
| 537 | |||
| 538 | 2 | float sum0[2] = { 0, 0 }; | |
| 539 | 2 | float sum1[2] = { 0, 0 }; | |
| 540 | 2 | unsigned int i = 0; | |
| 541 | |||
| 542 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
| 543 | 131070 | sum0[0] += in[0] * tp[0] - in[1] * tp[1]; | |
| 544 | 131070 | sum0[1] += in[0] * tp[1] + in[1] * tp[0]; | |
| 545 | 131070 | sum1[0] += in[2] * tp[2] - in[3] * tp[3]; | |
| 546 | 131070 | sum1[1] += in[2] * tp[3] + in[3] * tp[2]; | |
| 547 | |||
| 548 | 131070 | in += 4; | |
| 549 | 131070 | tp += 4; | |
| 550 | } | ||
| 551 | |||
| 552 | 2 | res[0] = sum0[0] + sum1[0]; | |
| 553 | 2 | res[1] = sum0[1] + sum1[1]; | |
| 554 | |||
| 555 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1) { |
| 556 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
| 557 | } | ||
| 558 | 2 | } | |
| 559 | |||
| 560 | #endif /*LV_HAVE_GENERIC*/ | ||
| 561 | |||
| 562 | |||
| 563 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
| 564 | |||
| 565 | |||
| 566 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, | |
| 567 | const lv_32fc_t* input, | ||
| 568 | const lv_32fc_t* taps, | ||
| 569 | unsigned int num_points) | ||
| 570 | { | ||
| 571 | |||
| 572 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 573 | 2 | unsigned int isodd = num_points & 1; | |
| 574 | |||
| 575 | 2 | __VOLK_ASM( | |
| 576 | "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" | ||
| 577 | "# const float *taps, unsigned num_bytes)\n\t" | ||
| 578 | "# float sum0 = 0;\n\t" | ||
| 579 | "# float sum1 = 0;\n\t" | ||
| 580 | "# float sum2 = 0;\n\t" | ||
| 581 | "# float sum3 = 0;\n\t" | ||
| 582 | "# do {\n\t" | ||
| 583 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
| 584 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
| 585 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
| 586 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
| 587 | "# input += 4;\n\t" | ||
| 588 | "# taps += 4; \n\t" | ||
| 589 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
| 590 | "# result[0] = sum0 + sum2;\n\t" | ||
| 591 | "# result[1] = sum1 + sum3;\n\t" | ||
| 592 | "# TODO: prefetch and better scheduling\n\t" | ||
| 593 | " xor %%r9, %%r9\n\t" | ||
| 594 | " xor %%r10, %%r10\n\t" | ||
| 595 | " movq %%rcx, %%rax\n\t" | ||
| 596 | " movq %%rcx, %%r8\n\t" | ||
| 597 | " movq %[rsi], %%r9\n\t" | ||
| 598 | " movq %[rdx], %%r10\n\t" | ||
| 599 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
| 600 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
| 601 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
| 602 | " shr $4, %%r8\n\t" | ||
| 603 | " jmp .%=L1_test\n\t" | ||
| 604 | " # 4 taps / loop\n\t" | ||
| 605 | " # something like ?? cycles / loop\n\t" | ||
| 606 | ".%=Loop1: \n\t" | ||
| 607 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
| 608 | "# movaps (%%r9), %%xmmA\n\t" | ||
| 609 | "# movaps (%%r10), %%xmmB\n\t" | ||
| 610 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 611 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
| 612 | "# mulps %%xmmB, %%xmmA\n\t" | ||
| 613 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
| 614 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
| 615 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
| 616 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 617 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
| 618 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
| 619 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
| 620 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
| 621 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
| 622 | "# addps %%xmmZ, %%xmmA\n\t" | ||
| 623 | "# addps %%xmmA, %%xmmC\n\t" | ||
| 624 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
| 625 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
| 626 | " movaps 0(%%r9), %%xmm0\n\t" | ||
| 627 | " movaps 16(%%r9), %%xmm1\n\t" | ||
| 628 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 629 | " movaps 0(%%r10), %%xmm2\n\t" | ||
| 630 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 631 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 632 | " movaps 16(%%r10), %%xmm3\n\t" | ||
| 633 | " movaps %%xmm1, %%xmm5\n\t" | ||
| 634 | " addps %%xmm0, %%xmm6\n\t" | ||
| 635 | " mulps %%xmm3, %%xmm1\n\t" | ||
| 636 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
| 637 | " addps %%xmm1, %%xmm6\n\t" | ||
| 638 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 639 | " addps %%xmm2, %%xmm7\n\t" | ||
| 640 | " mulps %%xmm5, %%xmm3\n\t" | ||
| 641 | " add $32, %%r9\n\t" | ||
| 642 | " addps %%xmm3, %%xmm7\n\t" | ||
| 643 | " add $32, %%r10\n\t" | ||
| 644 | ".%=L1_test:\n\t" | ||
| 645 | " dec %%rax\n\t" | ||
| 646 | " jge .%=Loop1\n\t" | ||
| 647 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
| 648 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | ||
| 649 | " # If so, we've got 2 more taps to do.\n\t" | ||
| 650 | " and $1, %%r8\n\t" | ||
| 651 | " je .%=Leven\n\t" | ||
| 652 | " # The count was odd, do 2 more taps.\n\t" | ||
| 653 | " movaps 0(%%r9), %%xmm0\n\t" | ||
| 654 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 655 | " movaps 0(%%r10), %%xmm2\n\t" | ||
| 656 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 657 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 658 | " addps %%xmm0, %%xmm6\n\t" | ||
| 659 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 660 | " addps %%xmm2, %%xmm7\n\t" | ||
| 661 | ".%=Leven:\n\t" | ||
| 662 | " # neg inversor\n\t" | ||
| 663 | " xorps %%xmm1, %%xmm1\n\t" | ||
| 664 | " mov $0x80000000, %%r9\n\t" | ||
| 665 | " movd %%r9, %%xmm1\n\t" | ||
| 666 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
| 667 | " # pfpnacc\n\t" | ||
| 668 | " xorps %%xmm1, %%xmm6\n\t" | ||
| 669 | " movaps %%xmm6, %%xmm2\n\t" | ||
| 670 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
| 671 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
| 672 | " movaps %%xmm2, %%xmm3\n\t" | ||
| 673 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
| 674 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
| 675 | " addps %%xmm2, %%xmm6\n\t" | ||
| 676 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
| 677 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
| 678 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
| 679 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
| 680 | "to memory\n\t" | ||
| 681 | : | ||
| 682 | : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) | ||
| 683 | : "rax", "r8", "r9", "r10"); | ||
| 684 | |||
| 685 | |||
| 686 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
| 687 | 2 | *result += input[num_points - 1] * taps[num_points - 1]; | |
| 688 | } | ||
| 689 | |||
| 690 | 2 | return; | |
| 691 | } | ||
| 692 | |||
| 693 | #endif | ||
| 694 | |||
| 695 | #if LV_HAVE_SSE && LV_HAVE_32 | ||
| 696 | |||
| 697 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, | ||
| 698 | const lv_32fc_t* input, | ||
| 699 | const lv_32fc_t* taps, | ||
| 700 | unsigned int num_points) | ||
| 701 | { | ||
| 702 | |||
| 703 | volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); | ||
| 704 | |||
| 705 | #if 0 | ||
| 706 | const unsigned int num_bytes = num_points*8; | ||
| 707 | unsigned int isodd = num_points & 1; | ||
| 708 | |||
| 709 | __VOLK_ASM __VOLK_VOLATILE | ||
| 710 | ( | ||
| 711 | " #pushl %%ebp\n\t" | ||
| 712 | " #movl %%esp, %%ebp\n\t" | ||
| 713 | " movl 12(%%ebp), %%eax # input\n\t" | ||
| 714 | " movl 16(%%ebp), %%edx # taps\n\t" | ||
| 715 | " movl 20(%%ebp), %%ecx # n_bytes\n\t" | ||
| 716 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
| 717 | " movaps 0(%%eax), %%xmm0\n\t" | ||
| 718 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
| 719 | " movaps 0(%%edx), %%xmm2\n\t" | ||
| 720 | " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" | ||
| 721 | " jmp .%=L1_test\n\t" | ||
| 722 | " # 4 taps / loop\n\t" | ||
| 723 | " # something like ?? cycles / loop\n\t" | ||
| 724 | ".%=Loop1: \n\t" | ||
| 725 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
| 726 | "# movaps (%%eax), %%xmmA\n\t" | ||
| 727 | "# movaps (%%edx), %%xmmB\n\t" | ||
| 728 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 729 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
| 730 | "# mulps %%xmmB, %%xmmA\n\t" | ||
| 731 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
| 732 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
| 733 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
| 734 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 735 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
| 736 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
| 737 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
| 738 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
| 739 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
| 740 | "# addps %%xmmZ, %%xmmA\n\t" | ||
| 741 | "# addps %%xmmA, %%xmmC\n\t" | ||
| 742 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
| 743 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
| 744 | " movaps 16(%%eax), %%xmm1\n\t" | ||
| 745 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 746 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 747 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 748 | " movaps 16(%%edx), %%xmm3\n\t" | ||
| 749 | " movaps %%xmm1, %%xmm5\n\t" | ||
| 750 | " addps %%xmm0, %%xmm6\n\t" | ||
| 751 | " mulps %%xmm3, %%xmm1\n\t" | ||
| 752 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
| 753 | " addps %%xmm1, %%xmm6\n\t" | ||
| 754 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 755 | " movaps 32(%%eax), %%xmm0\n\t" | ||
| 756 | " addps %%xmm2, %%xmm7\n\t" | ||
| 757 | " mulps %%xmm5, %%xmm3\n\t" | ||
| 758 | " addl $32, %%eax\n\t" | ||
| 759 | " movaps 32(%%edx), %%xmm2\n\t" | ||
| 760 | " addps %%xmm3, %%xmm7\n\t" | ||
| 761 | " addl $32, %%edx\n\t" | ||
| 762 | ".%=L1_test:\n\t" | ||
| 763 | " decl %%ecx\n\t" | ||
| 764 | " jge .%=Loop1\n\t" | ||
| 765 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
| 766 | " # Let's see if original n_2_ccomplex_blocks was odd.\n\t" | ||
| 767 | " # If so, we've got 2 more taps to do.\n\t" | ||
| 768 | " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" | ||
| 769 | " shrl $4, %%ecx\n\t" | ||
| 770 | " andl $1, %%ecx\n\t" | ||
| 771 | " je .%=Leven\n\t" | ||
| 772 | " # The count was odd, do 2 more taps.\n\t" | ||
| 773 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
| 774 | " # from the main loop.\n\t" | ||
| 775 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 776 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 777 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 778 | " addps %%xmm0, %%xmm6\n\t" | ||
| 779 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 780 | " addps %%xmm2, %%xmm7\n\t" | ||
| 781 | ".%=Leven:\n\t" | ||
| 782 | " # neg inversor\n\t" | ||
| 783 | " movl 8(%%ebp), %%eax \n\t" | ||
| 784 | " xorps %%xmm1, %%xmm1\n\t" | ||
| 785 | " movl $0x80000000, (%%eax)\n\t" | ||
| 786 | " movss (%%eax), %%xmm1\n\t" | ||
| 787 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
| 788 | " # pfpnacc\n\t" | ||
| 789 | " xorps %%xmm1, %%xmm6\n\t" | ||
| 790 | " movaps %%xmm6, %%xmm2\n\t" | ||
| 791 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
| 792 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
| 793 | " movaps %%xmm2, %%xmm3\n\t" | ||
| 794 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
| 795 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
| 796 | " addps %%xmm2, %%xmm6\n\t" | ||
| 797 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
| 798 | " #movl 8(%%ebp), %%eax # @result\n\t" | ||
| 799 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
| 800 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
| 801 | " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" | ||
| 802 | " #popl %%ebp\n\t" | ||
| 803 | : | ||
| 804 | : | ||
| 805 | : "eax", "ecx", "edx" | ||
| 806 | ); | ||
| 807 | |||
| 808 | |||
| 809 | int getem = num_bytes % 16; | ||
| 810 | |||
| 811 | if(isodd) { | ||
| 812 | *result += (input[num_points - 1] * taps[num_points - 1]); | ||
| 813 | } | ||
| 814 | |||
| 815 | return; | ||
| 816 | #endif | ||
| 817 | } | ||
| 818 | |||
| 819 | #endif /*LV_HAVE_SSE*/ | ||
| 820 | |||
| 821 | #ifdef LV_HAVE_SSE3 | ||
| 822 | |||
| 823 | #include <pmmintrin.h> | ||
| 824 | |||
| 825 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, | |
| 826 | const lv_32fc_t* input, | ||
| 827 | const lv_32fc_t* taps, | ||
| 828 | unsigned int num_points) | ||
| 829 | { | ||
| 830 | |||
| 831 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 832 | 2 | unsigned int isodd = num_points & 1; | |
| 833 | |||
| 834 | lv_32fc_t dotProduct; | ||
| 835 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 836 | |||
| 837 | 2 | unsigned int number = 0; | |
| 838 | 2 | const unsigned int halfPoints = num_bytes >> 4; | |
| 839 | |||
| 840 | __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 841 | |||
| 842 | 2 | const lv_32fc_t* a = input; | |
| 843 | 2 | const lv_32fc_t* b = taps; | |
| 844 | |||
| 845 | 2 | dotProdVal = _mm_setzero_ps(); | |
| 846 | |||
| 847 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
| 848 | |||
| 849 | 131070 | x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
| 850 | 131070 | y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
| 851 | |||
| 852 | 131070 | yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
| 853 | 131070 | yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | |
| 854 | |||
| 855 | 131070 | tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | |
| 856 | |||
| 857 | 131070 | x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
| 858 | |||
| 859 | 131070 | tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
| 860 | |||
| 861 | 131070 | z = _mm_addsub_ps(tmp1, | |
| 862 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 863 | |||
| 864 | dotProdVal = | ||
| 865 | 131070 | _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together | |
| 866 | |||
| 867 | 131070 | a += 2; | |
| 868 | 131070 | b += 2; | |
| 869 | } | ||
| 870 | |||
| 871 | __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; | ||
| 872 | |||
| 873 | _mm_store_ps((float*)dotProductVector, | ||
| 874 | dotProdVal); // Store the results back into the dot product vector | ||
| 875 | |||
| 876 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1]); | |
| 877 | |||
| 878 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (isodd) { |
| 879 | 2 | dotProduct += input[num_points - 1] * taps[num_points - 1]; | |
| 880 | } | ||
| 881 | |||
| 882 | 2 | *result = dotProduct; | |
| 883 | 2 | } | |
| 884 | |||
| 885 | #endif /*LV_HAVE_SSE3*/ | ||
| 886 | |||
| 887 | |||
| 888 | // #ifdef LV_HAVE_SSE4_1 | ||
| 889 | |||
| 890 | // #include <smmintrin.h> | ||
| 891 | |||
| 892 | // static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, | ||
| 893 | // const lv_32fc_t* input, | ||
| 894 | // const lv_32fc_t* taps, | ||
| 895 | // unsigned int num_points) | ||
| 896 | // { | ||
| 897 | |||
| 898 | // unsigned int i = 0; | ||
| 899 | // const unsigned int qtr_points = num_points / 4; | ||
| 900 | // const unsigned int isodd = num_points & 3; | ||
| 901 | |||
| 902 | // __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; | ||
| 903 | // float *p_input, *p_taps; | ||
| 904 | // __m64* p_result; | ||
| 905 | |||
| 906 | // static const __m128i neg = { 0x000000000000000080000000 }; | ||
| 907 | |||
| 908 | // p_result = (__m64*)result; | ||
| 909 | // p_input = (float*)input; | ||
| 910 | // p_taps = (float*)taps; | ||
| 911 | |||
| 912 | // real0 = _mm_setzero_ps(); | ||
| 913 | // real1 = _mm_setzero_ps(); | ||
| 914 | // im0 = _mm_setzero_ps(); | ||
| 915 | // im1 = _mm_setzero_ps(); | ||
| 916 | |||
| 917 | // for (; i < qtr_points; ++i) { | ||
| 918 | // xmm0 = _mm_load_ps(p_input); | ||
| 919 | // xmm1 = _mm_load_ps(p_taps); | ||
| 920 | |||
| 921 | // p_input += 4; | ||
| 922 | // p_taps += 4; | ||
| 923 | |||
| 924 | // xmm2 = _mm_load_ps(p_input); | ||
| 925 | // xmm3 = _mm_load_ps(p_taps); | ||
| 926 | |||
| 927 | // p_input += 4; | ||
| 928 | // p_taps += 4; | ||
| 929 | |||
| 930 | // xmm4 = _mm_unpackhi_ps(xmm0, xmm2); | ||
| 931 | // xmm5 = _mm_unpackhi_ps(xmm1, xmm3); | ||
| 932 | // xmm0 = _mm_unpacklo_ps(xmm0, xmm2); | ||
| 933 | // xmm2 = _mm_unpacklo_ps(xmm1, xmm3); | ||
| 934 | |||
| 935 | // // imaginary vector from input | ||
| 936 | // xmm1 = _mm_unpackhi_ps(xmm0, xmm4); | ||
| 937 | // // real vector from input | ||
| 938 | // xmm3 = _mm_unpacklo_ps(xmm0, xmm4); | ||
| 939 | // // imaginary vector from taps | ||
| 940 | // xmm0 = _mm_unpackhi_ps(xmm2, xmm5); | ||
| 941 | // // real vector from taps | ||
| 942 | // xmm2 = _mm_unpacklo_ps(xmm2, xmm5); | ||
| 943 | |||
| 944 | // xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); | ||
| 945 | // xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); | ||
| 946 | |||
| 947 | // xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); | ||
| 948 | // xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); | ||
| 949 | |||
| 950 | // real0 = _mm_add_ps(xmm4, real0); | ||
| 951 | // real1 = _mm_add_ps(xmm5, real1); | ||
| 952 | // im0 = _mm_add_ps(xmm6, im0); | ||
| 953 | // im1 = _mm_add_ps(xmm7, im1); | ||
| 954 | // } | ||
| 955 | |||
| 956 | // real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); | ||
| 957 | |||
| 958 | // im0 = _mm_add_ps(im0, im1); | ||
| 959 | // real0 = _mm_add_ps(real0, real1); | ||
| 960 | |||
| 961 | // im0 = _mm_add_ps(im0, real0); | ||
| 962 | |||
| 963 | // _mm_storel_pi(p_result, im0); | ||
| 964 | |||
| 965 | // for (i = num_points - isodd; i < num_points; i++) { | ||
| 966 | // *result += input[i] * taps[i]; | ||
| 967 | // } | ||
| 968 | // } | ||
| 969 | |||
| 970 | // #endif /*LV_HAVE_SSE4_1*/ | ||
| 971 | |||
| 972 | #ifdef LV_HAVE_NEON | ||
| 973 | #include <arm_neon.h> | ||
| 974 | |||
| 975 | static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, | ||
| 976 | const lv_32fc_t* input, | ||
| 977 | const lv_32fc_t* taps, | ||
| 978 | unsigned int num_points) | ||
| 979 | { | ||
| 980 | |||
| 981 | unsigned int quarter_points = num_points / 4; | ||
| 982 | unsigned int number; | ||
| 983 | |||
| 984 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
| 985 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
| 986 | // for 2-lane vectors, 1st lane holds the real part, | ||
| 987 | // 2nd lane holds the imaginary part | ||
| 988 | float32x4x2_t a_val, b_val, c_val, accumulator; | ||
| 989 | float32x4x2_t tmp_real, tmp_imag; | ||
| 990 | accumulator.val[0] = vdupq_n_f32(0); | ||
| 991 | accumulator.val[1] = vdupq_n_f32(0); | ||
| 992 | |||
| 993 | for (number = 0; number < quarter_points; ++number) { | ||
| 994 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
| 995 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
| 996 | __VOLK_PREFETCH(a_ptr + 8); | ||
| 997 | __VOLK_PREFETCH(b_ptr + 8); | ||
| 998 | |||
| 999 | // multiply the real*real and imag*imag to get real result | ||
| 1000 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
| 1001 | tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
| 1002 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
| 1003 | tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); | ||
| 1004 | |||
| 1005 | // Multiply cross terms to get the imaginary result | ||
| 1006 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
| 1007 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); | ||
| 1008 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
| 1009 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
| 1010 | |||
| 1011 | c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); | ||
| 1012 | c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); | ||
| 1013 | |||
| 1014 | accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]); | ||
| 1015 | accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]); | ||
| 1016 | |||
| 1017 | a_ptr += 4; | ||
| 1018 | b_ptr += 4; | ||
| 1019 | } | ||
| 1020 | lv_32fc_t accum_result[4]; | ||
| 1021 | vst2q_f32((float*)accum_result, accumulator); | ||
| 1022 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
| 1023 | |||
| 1024 | // tail case | ||
| 1025 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
| 1026 | *result += (*a_ptr++) * (*b_ptr++); | ||
| 1027 | } | ||
| 1028 | } | ||
| 1029 | #endif /*LV_HAVE_NEON*/ | ||
| 1030 | |||
| 1031 | #ifdef LV_HAVE_NEON | ||
| 1032 | #include <arm_neon.h> | ||
| 1033 | static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, | ||
| 1034 | const lv_32fc_t* input, | ||
| 1035 | const lv_32fc_t* taps, | ||
| 1036 | unsigned int num_points) | ||
| 1037 | { | ||
| 1038 | |||
| 1039 | unsigned int quarter_points = num_points / 4; | ||
| 1040 | unsigned int number; | ||
| 1041 | |||
| 1042 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
| 1043 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
| 1044 | // for 2-lane vectors, 1st lane holds the real part, | ||
| 1045 | // 2nd lane holds the imaginary part | ||
| 1046 | float32x4x2_t a_val, b_val, accumulator; | ||
| 1047 | float32x4x2_t tmp_imag; | ||
| 1048 | accumulator.val[0] = vdupq_n_f32(0); | ||
| 1049 | accumulator.val[1] = vdupq_n_f32(0); | ||
| 1050 | |||
| 1051 | for (number = 0; number < quarter_points; ++number) { | ||
| 1052 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
| 1053 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
| 1054 | __VOLK_PREFETCH(a_ptr + 8); | ||
| 1055 | __VOLK_PREFETCH(b_ptr + 8); | ||
| 1056 | |||
| 1057 | // do the first multiply | ||
| 1058 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
| 1059 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
| 1060 | |||
| 1061 | // use multiply accumulate/subtract to get result | ||
| 1062 | tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); | ||
| 1063 | tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); | ||
| 1064 | |||
| 1065 | accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); | ||
| 1066 | accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); | ||
| 1067 | |||
| 1068 | // increment pointers | ||
| 1069 | a_ptr += 4; | ||
| 1070 | b_ptr += 4; | ||
| 1071 | } | ||
| 1072 | lv_32fc_t accum_result[4]; | ||
| 1073 | vst2q_f32((float*)accum_result, accumulator); | ||
| 1074 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
| 1075 | |||
| 1076 | // tail case | ||
| 1077 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
| 1078 | *result += (*a_ptr++) * (*b_ptr++); | ||
| 1079 | } | ||
| 1080 | } | ||
| 1081 | #endif /*LV_HAVE_NEON*/ | ||
| 1082 | |||
| 1083 | #ifdef LV_HAVE_NEON | ||
| 1084 | static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, | ||
| 1085 | const lv_32fc_t* input, | ||
| 1086 | const lv_32fc_t* taps, | ||
| 1087 | unsigned int num_points) | ||
| 1088 | { | ||
| 1089 | |||
| 1090 | unsigned int quarter_points = num_points / 4; | ||
| 1091 | unsigned int number; | ||
| 1092 | |||
| 1093 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
| 1094 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
| 1095 | // for 2-lane vectors, 1st lane holds the real part, | ||
| 1096 | // 2nd lane holds the imaginary part | ||
| 1097 | float32x4x2_t a_val, b_val, accumulator1, accumulator2; | ||
| 1098 | accumulator1.val[0] = vdupq_n_f32(0); | ||
| 1099 | accumulator1.val[1] = vdupq_n_f32(0); | ||
| 1100 | accumulator2.val[0] = vdupq_n_f32(0); | ||
| 1101 | accumulator2.val[1] = vdupq_n_f32(0); | ||
| 1102 | |||
| 1103 | for (number = 0; number < quarter_points; ++number) { | ||
| 1104 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
| 1105 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
| 1106 | __VOLK_PREFETCH(a_ptr + 8); | ||
| 1107 | __VOLK_PREFETCH(b_ptr + 8); | ||
| 1108 | |||
| 1109 | // use 2 accumulators to remove inter-instruction data dependencies | ||
| 1110 | accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); | ||
| 1111 | accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); | ||
| 1112 | accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); | ||
| 1113 | accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); | ||
| 1114 | // increment pointers | ||
| 1115 | a_ptr += 4; | ||
| 1116 | b_ptr += 4; | ||
| 1117 | } | ||
| 1118 | accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); | ||
| 1119 | accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); | ||
| 1120 | lv_32fc_t accum_result[4]; | ||
| 1121 | vst2q_f32((float*)accum_result, accumulator1); | ||
| 1122 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
| 1123 | |||
| 1124 | // tail case | ||
| 1125 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
| 1126 | *result += (*a_ptr++) * (*b_ptr++); | ||
| 1127 | } | ||
| 1128 | } | ||
| 1129 | #endif /*LV_HAVE_NEON*/ | ||
| 1130 | |||
| 1131 | #ifdef LV_HAVE_NEON | ||
| 1132 | static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, | ||
| 1133 | const lv_32fc_t* input, | ||
| 1134 | const lv_32fc_t* taps, | ||
| 1135 | unsigned int num_points) | ||
| 1136 | { | ||
| 1137 | // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very | ||
| 1138 | // fast | ||
| 1139 | |||
| 1140 | unsigned int quarter_points = num_points / 8; | ||
| 1141 | unsigned int number; | ||
| 1142 | |||
| 1143 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
| 1144 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
| 1145 | // for 2-lane vectors, 1st lane holds the real part, | ||
| 1146 | // 2nd lane holds the imaginary part | ||
| 1147 | float32x4x4_t a_val, b_val, accumulator1, accumulator2; | ||
| 1148 | float32x4x2_t reduced_accumulator; | ||
| 1149 | accumulator1.val[0] = vdupq_n_f32(0); | ||
| 1150 | accumulator1.val[1] = vdupq_n_f32(0); | ||
| 1151 | accumulator1.val[2] = vdupq_n_f32(0); | ||
| 1152 | accumulator1.val[3] = vdupq_n_f32(0); | ||
| 1153 | accumulator2.val[0] = vdupq_n_f32(0); | ||
| 1154 | accumulator2.val[1] = vdupq_n_f32(0); | ||
| 1155 | accumulator2.val[2] = vdupq_n_f32(0); | ||
| 1156 | accumulator2.val[3] = vdupq_n_f32(0); | ||
| 1157 | |||
| 1158 | // 8 input regs, 8 accumulators -> 16/16 neon regs are used | ||
| 1159 | for (number = 0; number < quarter_points; ++number) { | ||
| 1160 | a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
| 1161 | b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
| 1162 | __VOLK_PREFETCH(a_ptr + 8); | ||
| 1163 | __VOLK_PREFETCH(b_ptr + 8); | ||
| 1164 | |||
| 1165 | // use 2 accumulators to remove inter-instruction data dependencies | ||
| 1166 | accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); | ||
| 1167 | accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); | ||
| 1168 | |||
| 1169 | accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]); | ||
| 1170 | accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]); | ||
| 1171 | |||
| 1172 | accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); | ||
| 1173 | accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); | ||
| 1174 | |||
| 1175 | accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]); | ||
| 1176 | accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]); | ||
| 1177 | // increment pointers | ||
| 1178 | a_ptr += 8; | ||
| 1179 | b_ptr += 8; | ||
| 1180 | } | ||
| 1181 | // reduce 8 accumulator lanes down to 2 (1 real and 1 imag) | ||
| 1182 | accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]); | ||
| 1183 | accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]); | ||
| 1184 | accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]); | ||
| 1185 | accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]); | ||
| 1186 | reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); | ||
| 1187 | reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); | ||
| 1188 | // now reduce accumulators to scalars | ||
| 1189 | lv_32fc_t accum_result[4]; | ||
| 1190 | vst2q_f32((float*)accum_result, reduced_accumulator); | ||
| 1191 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
| 1192 | |||
| 1193 | // tail case | ||
| 1194 | for (number = quarter_points * 8; number < num_points; ++number) { | ||
| 1195 | *result += (*a_ptr++) * (*b_ptr++); | ||
| 1196 | } | ||
| 1197 | } | ||
| 1198 | #endif /*LV_HAVE_NEON*/ | ||
| 1199 | |||
| 1200 | |||
| 1201 | #ifdef LV_HAVE_AVX | ||
| 1202 | |||
| 1203 | #include <immintrin.h> | ||
| 1204 | |||
| 1205 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
| 1206 | const lv_32fc_t* input, | ||
| 1207 | const lv_32fc_t* taps, | ||
| 1208 | unsigned int num_points) | ||
| 1209 | { | ||
| 1210 | |||
| 1211 | 2 | unsigned int isodd = num_points & 3; | |
| 1212 | 2 | unsigned int i = 0; | |
| 1213 | lv_32fc_t dotProduct; | ||
| 1214 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 1215 | |||
| 1216 | 2 | unsigned int number = 0; | |
| 1217 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 1218 | |||
| 1219 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 1220 | |||
| 1221 | 2 | const lv_32fc_t* a = input; | |
| 1222 | 2 | const lv_32fc_t* b = taps; | |
| 1223 | |||
| 1224 | 2 | dotProdVal = _mm256_setzero_ps(); | |
| 1225 | |||
| 1226 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 1227 | |||
| 1228 | 65534 | x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
| 1229 | 65534 | y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
| 1230 | |||
| 1231 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
| 1232 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
| 1233 | |||
| 1234 | 65534 | tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... | |
| 1235 | |||
| 1236 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
| 1237 | |||
| 1238 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
| 1239 | |||
| 1240 | 65534 | z = _mm256_addsub_ps(tmp1, | |
| 1241 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 1242 | |||
| 1243 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
| 1244 | z); // Add the complex multiplication results together | ||
| 1245 | |||
| 1246 | 65534 | a += 4; | |
| 1247 | 65534 | b += 4; | |
| 1248 | } | ||
| 1249 | |||
| 1250 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
| 1251 | |||
| 1252 | _mm256_store_ps((float*)dotProductVector, | ||
| 1253 | dotProdVal); // Store the results back into the dot product vector | ||
| 1254 | |||
| 1255 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
| 1256 | 2 | dotProductVector[3]); | |
| 1257 | |||
| 1258 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
| 1259 | 6 | dotProduct += input[i] * taps[i]; | |
| 1260 | } | ||
| 1261 | |||
| 1262 | 2 | *result = dotProduct; | |
| 1263 | 2 | } | |
| 1264 | |||
| 1265 | #endif /*LV_HAVE_AVX*/ | ||
| 1266 | |||
| 1267 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
| 1268 | #include <immintrin.h> | ||
| 1269 | |||
| 1270 | 2 | static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, | |
| 1271 | const lv_32fc_t* input, | ||
| 1272 | const lv_32fc_t* taps, | ||
| 1273 | unsigned int num_points) | ||
| 1274 | { | ||
| 1275 | |||
| 1276 | 2 | unsigned int isodd = num_points & 3; | |
| 1277 | 2 | unsigned int i = 0; | |
| 1278 | lv_32fc_t dotProduct; | ||
| 1279 | 2 | memset(&dotProduct, 0x0, 2 * sizeof(float)); | |
| 1280 | |||
| 1281 | 2 | unsigned int number = 0; | |
| 1282 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 1283 | |||
| 1284 | __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; | ||
| 1285 | |||
| 1286 | 2 | const lv_32fc_t* a = input; | |
| 1287 | 2 | const lv_32fc_t* b = taps; | |
| 1288 | |||
| 1289 | 2 | dotProdVal = _mm256_setzero_ps(); | |
| 1290 | |||
| 1291 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 1292 | |||
| 1293 | 65534 | x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi | |
| 1294 | 65534 | y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi | |
| 1295 | |||
| 1296 | 65534 | yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr | |
| 1297 | 65534 | yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi | |
| 1298 | |||
| 1299 | 65534 | tmp1 = x; | |
| 1300 | |||
| 1301 | 65534 | x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr | |
| 1302 | |||
| 1303 | 65534 | tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... | |
| 1304 | |||
| 1305 | 65534 | z = _mm256_fmaddsub_ps( | |
| 1306 | tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
| 1307 | |||
| 1308 | 65534 | dotProdVal = _mm256_add_ps(dotProdVal, | |
| 1309 | z); // Add the complex multiplication results together | ||
| 1310 | |||
| 1311 | 65534 | a += 4; | |
| 1312 | 65534 | b += 4; | |
| 1313 | } | ||
| 1314 | |||
| 1315 | __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; | ||
| 1316 | |||
| 1317 | _mm256_store_ps((float*)dotProductVector, | ||
| 1318 | dotProdVal); // Store the results back into the dot product vector | ||
| 1319 | |||
| 1320 | 2 | dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + | |
| 1321 | 2 | dotProductVector[3]); | |
| 1322 | |||
| 1323 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (i = num_points - isodd; i < num_points; i++) { |
| 1324 | 6 | dotProduct += input[i] * taps[i]; | |
| 1325 | } | ||
| 1326 | |||
| 1327 | 2 | *result = dotProduct; | |
| 1328 | 2 | } | |
| 1329 | |||
| 1330 | #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ | ||
| 1331 | |||
| 1332 | |||
| 1333 | #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ | ||
| 1334 |
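
The Doxygen example near the top of the listing (source lines 35-42) only sketches the call inside a comment. Below is a minimal, self-contained usage sketch of the dispatcher, not part of the covered header. It assumes the standard VOLK helpers `volk_get_alignment()`, `volk_malloc()`, `volk_free()` and the `lv_cmake`/`lv_creal`/`lv_cimag` macros pulled in by `volk/volk.h`; the test data it fills in is purely illustrative.

```c
// Minimal usage sketch (not part of the covered header): allocate aligned
// buffers, fill them with illustrative data, and call the dispatcher.
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 10000;
    const size_t alignment = volk_get_alignment();

    lv_32fc_t* input = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t result;

    for (unsigned int i = 0; i < N; i++) {
        input[i] = lv_cmake((float)i, (float)i * 0.5f); // arbitrary test data
        taps[i] = lv_cmake(1.0f, 0.0f);                 // pass-through taps
    }

    volk_32fc_x2_dot_prod_32fc(&result, input, taps, N);
    printf("dot product = %f + %fj\n", lv_creal(result), lv_cimag(result));

    volk_free(input);
    volk_free(taps);
    return 0;
}
```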
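Every SIMD branch in the listing (SSE, SSE3, AVX, AVX+FMA, NEON) accumulates the same unconjugated complex multiply as the generic kernel; the `moveldup`/`movehdup`/`shuffle`/`addsub` sequences are a vectorized form of it. The scalar model below is a sketch to help read those sequences; the helper name `dot_prod_lane_model` is illustrative and does not appear in the header.

```c
#include <volk/volk_complex.h>

/* Scalar model of one complex element in the SSE3/AVX loops:
 *   tmp1 = (ar*cr, ai*cr)            // x * moveldup(y)
 *   tmp2 = (ai*ci, ar*ci)            // shuffle(x, 0xB1) * movehdup(y)
 *   z    = addsub(tmp1, tmp2) = (ar*cr - ai*ci, ai*cr + ar*ci)
 * i.e. the unconjugated product a*c, which is then accumulated. */
static inline lv_32fc_t dot_prod_lane_model(lv_32fc_t acc, lv_32fc_t a, lv_32fc_t c)
{
    const float ar = lv_creal(a), ai = lv_cimag(a);
    const float cr = lv_creal(c), ci = lv_cimag(c);
    const float re = ar * cr - ai * ci; // "subtract" lanes of addsub
    const float im = ai * cr + ar * ci; // "add" lanes of addsub
    return acc + lv_cmake(re, im);
}
```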