| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32fc_x2_conjugate_dot_prod_32fc | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * This block computes the conjugate dot product (or inner product) | ||
| 16 | * between two vectors, the \p input and \p taps vectors. Given a set | ||
| 17 | * of \p num_points taps, the result is the sum of products between | ||
| 18 | * the input vector and the conjugate of the taps. The result is a | ||
| 19 | * single value stored in the \p result address and is returned as a | ||
| 20 | * complex float. | ||
| 21 | * | ||
| 22 | * <b>Dispatcher Prototype</b> | ||
| 23 | * \code | ||
| 24 | * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, | ||
| 25 | * const lv_32fc_t* taps, unsigned int num_points) \endcode | ||
| 26 | * | ||
| 27 | * \b Inputs | ||
| 28 | * \li input: vector of complex floats. | ||
| 29 | * \li taps: complex float taps. | ||
| 30 | * \li num_points: number of samples in both \p input and \p taps. | ||
| 31 | * | ||
| 32 | * \b Outputs | ||
| 33 | * \li result: pointer to a complex float value to hold the dot product result. | ||
| 34 | * | ||
| 35 | * \b Example | ||
| 36 | * \code | ||
| 37 | * unsigned int N = 1000; | ||
| 38 | * unsigned int alignment = volk_get_alignment(); | ||
| 39 | * | ||
| 40 | * lv_32fc_t* a = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); | ||
| 41 | * lv_32fc_t* b = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); | ||
| 42 | * | ||
| 43 | * for (int i = 0; i < N; ++i) { | ||
| 44 | * a[i] = lv_cmake(.50f, .50f); | ||
| 45 | * b[i] = lv_cmake(.50f, .75f); | ||
| 46 | * } | ||
| 47 | * | ||
| 48 | * lv_32fc_t e = (float) N * a[0] * lv_conj(b[0]); // When a and b constant | ||
| 49 | * lv_32fc_t res; | ||
| 50 | * | ||
| 51 | * volk_32fc_x2_conjugate_dot_prod_32fc(&res, a, b, N); | ||
| 52 | * | ||
| 53 | * printf("Expected: %8.2f%+8.2fi\n", lv_real(e), lv_imag(e)); | ||
| 54 | * printf("Result: %8.2f%+8.2fi\n", lv_real(res), lv_imag(res)); | ||
| 55 | * | ||
| 56 | * volk_free(a); | ||
| 57 | * volk_free(b); | ||
| 58 | * \endcode | ||
| 59 | */ | ||
| 60 | |||
| 61 | #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H | ||
| 62 | #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H | ||
| 63 | |||
| 64 | |||
| 65 | #include <volk/volk_complex.h> | ||
| 66 | |||
| 67 | |||
| 68 | #ifdef LV_HAVE_GENERIC | ||
| 69 | |||
| 70 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, | |
| 71 | const lv_32fc_t* input, | ||
| 72 | const lv_32fc_t* taps, | ||
| 73 | unsigned int num_points) | ||
| 74 | { | ||
| 75 | 2 | lv_32fc_t res = lv_cmake(0.f, 0.f); | |
| 76 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (unsigned int i = 0; i < num_points; ++i) { |
| 77 | 262142 | res += (*input++) * lv_conj((*taps++)); | |
| 78 | } | ||
| 79 | 2 | *result = res; | |
| 80 | 2 | } | |
| 81 | |||
| 82 | #endif /*LV_HAVE_GENERIC*/ | ||
| 83 | |||
| 84 | #ifdef LV_HAVE_GENERIC | ||
| 85 | |||
| 86 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_block(lv_32fc_t* result, | |
| 87 | const lv_32fc_t* input, | ||
| 88 | const lv_32fc_t* taps, | ||
| 89 | unsigned int num_points) | ||
| 90 | { | ||
| 91 | |||
| 92 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 93 | |||
| 94 | 2 | float* res = (float*)result; | |
| 95 | 2 | float* in = (float*)input; | |
| 96 | 2 | float* tp = (float*)taps; | |
| 97 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
| 98 | |||
| 99 | 2 | float sum0[2] = { 0, 0 }; | |
| 100 | 2 | float sum1[2] = { 0, 0 }; | |
| 101 | 2 | unsigned int i = 0; | |
| 102 | |||
| 103 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
| 104 | 131070 | sum0[0] += in[0] * tp[0] + in[1] * tp[1]; | |
| 105 | 131070 | sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; | |
| 106 | 131070 | sum1[0] += in[2] * tp[2] + in[3] * tp[3]; | |
| 107 | 131070 | sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; | |
| 108 | |||
| 109 | 131070 | in += 4; | |
| 110 | 131070 | tp += 4; | |
| 111 | } | ||
| 112 | |||
| 113 | 2 | res[0] = sum0[0] + sum1[0]; | |
| 114 | 2 | res[1] = sum0[1] + sum1[1]; | |
| 115 | |||
| 116 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 3 & 1) { |
| 117 | 2 | *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); | |
| 118 | } | ||
| 119 | 2 | } | |
| 120 | |||
| 121 | #endif /*LV_HAVE_GENERIC*/ | ||
| 122 | |||
| 123 | #ifdef LV_HAVE_AVX | ||
| 124 | |||
| 125 | #include <immintrin.h> | ||
| 126 | |||
| 127 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
| 128 | const lv_32fc_t* input, | ||
| 129 | const lv_32fc_t* taps, | ||
| 130 | unsigned int num_points) | ||
| 131 | { | ||
| 132 | // Partial sums for indices i, i+1, i+2 and i+3. | ||
| 133 | 2 | __m256 sum_a_mult_b_real = _mm256_setzero_ps(); | |
| 134 | 2 | __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); | |
| 135 | |||
| 136 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { |
| 137 | /* Four complex elements a time are processed. | ||
| 138 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
| 139 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
| 140 | */ | ||
| 141 | |||
| 142 | /* Load input and taps, split and duplicate real und imaginary parts of taps. | ||
| 143 | * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
| 144 | * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
| 145 | * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
| 146 | * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
| 147 | */ | ||
| 148 | 65534 | __m256 a = _mm256_loadu_ps((const float*)&input[i]); | |
| 149 | 131068 | __m256 b = _mm256_loadu_ps((const float*)&taps[i]); | |
| 150 | 65534 | __m256 b_real = _mm256_moveldup_ps(b); | |
| 151 | 65534 | __m256 b_imag = _mm256_movehdup_ps(b); | |
| 152 | |||
| 153 | // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
| 154 | 131068 | sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); | |
| 155 | // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
| 156 | 131068 | sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); | |
| 157 | } | ||
| 158 | |||
| 159 | // Swap position of −ar⋅bi and ai⋅bi. | ||
| 160 | 2 | sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
| 161 | // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. | ||
| 162 | 2 | __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
| 163 | /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. | ||
| 164 | * s1 + s3 and s0 + s2 … | ||
| 165 | */ | ||
| 166 | 2 | sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); | |
| 167 | // … and now (s0 + s2) + (s1 + s3) | ||
| 168 | 2 | sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
| 169 | // Store result. | ||
| 170 | 2 | __m128 lower = _mm256_extractf128_ps(sum, 0); | |
| 171 | _mm_storel_pi((__m64*)result, lower); | ||
| 172 | |||
| 173 | // Handle the last elements if num_points mod 4 is bigger than 0. | ||
| 174 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (long unsigned i = num_points & ~3u; i < num_points; ++i) { |
| 175 | 6 | *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + | |
| 176 | lv_cimag(input[i]) * lv_cimag(taps[i]), | ||
| 177 | lv_cimag(input[i]) * lv_creal(taps[i]) - | ||
| 178 | lv_creal(input[i]) * lv_cimag(taps[i])); | ||
| 179 | } | ||
| 180 | 2 | } | |
| 181 | |||
| 182 | #endif /* LV_HAVE_AVX */ | ||
| 183 | |||
| 184 | #ifdef LV_HAVE_SSE3 | ||
| 185 | |||
| 186 | #include <pmmintrin.h> | ||
| 187 | #include <xmmintrin.h> | ||
| 188 | |||
| 189 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, | |
| 190 | const lv_32fc_t* input, | ||
| 191 | const lv_32fc_t* taps, | ||
| 192 | unsigned int num_points) | ||
| 193 | { | ||
| 194 | // Partial sums for indices i and i+1. | ||
| 195 | 2 | __m128 sum_a_mult_b_real = _mm_setzero_ps(); | |
| 196 | 2 | __m128 sum_a_mult_b_imag = _mm_setzero_ps(); | |
| 197 | |||
| 198 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { |
| 199 | /* Two complex elements a time are processed. | ||
| 200 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
| 201 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
| 202 | */ | ||
| 203 | |||
| 204 | /* Load input and taps, split and duplicate real und imaginary parts of taps. | ||
| 205 | * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
| 206 | * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
| 207 | * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
| 208 | * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
| 209 | */ | ||
| 210 | 131070 | __m128 a = _mm_loadu_ps((const float*)&input[i]); | |
| 211 | 262140 | __m128 b = _mm_loadu_ps((const float*)&taps[i]); | |
| 212 | 131070 | __m128 b_real = _mm_moveldup_ps(b); | |
| 213 | 131070 | __m128 b_imag = _mm_movehdup_ps(b); | |
| 214 | |||
| 215 | // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
| 216 | 262140 | sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); | |
| 217 | // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
| 218 | 262140 | sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); | |
| 219 | } | ||
| 220 | |||
| 221 | // Swap position of −ar⋅bi and ai⋅bi. | ||
| 222 | sum_a_mult_b_imag = | ||
| 223 | 2 | _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
| 224 | // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. | ||
| 225 | 2 | __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
| 226 | // Sum the two partial sums. | ||
| 227 | 4 | sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
| 228 | // Store result. | ||
| 229 | _mm_storel_pi((__m64*)result, sum); | ||
| 230 | |||
| 231 | // Handle the last element if num_points mod 2 is 1. | ||
| 232 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points & 1u) { |
| 233 | 2 | *result += lv_cmake( | |
| 234 | lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + | ||
| 235 | lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), | ||
| 236 | lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - | ||
| 237 | lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); | ||
| 238 | } | ||
| 239 | 2 | } | |
| 240 | |||
| 241 | #endif /*LV_HAVE_SSE3*/ | ||
| 242 | |||
| 243 | #ifdef LV_HAVE_NEON | ||
| 244 | #include <arm_neon.h> | ||
| 245 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, | ||
| 246 | const lv_32fc_t* input, | ||
| 247 | const lv_32fc_t* taps, | ||
| 248 | unsigned int num_points) | ||
| 249 | { | ||
| 250 | |||
| 251 | unsigned int quarter_points = num_points / 4; | ||
| 252 | unsigned int number; | ||
| 253 | |||
| 254 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
| 255 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
| 256 | // for 2-lane vectors, 1st lane holds the real part, | ||
| 257 | // 2nd lane holds the imaginary part | ||
| 258 | float32x4x2_t a_val, b_val, accumulator; | ||
| 259 | float32x4x2_t tmp_imag; | ||
| 260 | accumulator.val[0] = vdupq_n_f32(0); | ||
| 261 | accumulator.val[1] = vdupq_n_f32(0); | ||
| 262 | |||
| 263 | for (number = 0; number < quarter_points; ++number) { | ||
| 264 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
| 265 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
| 266 | __VOLK_PREFETCH(a_ptr + 8); | ||
| 267 | __VOLK_PREFETCH(b_ptr + 8); | ||
| 268 | |||
| 269 | // do the first multiply | ||
| 270 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
| 271 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
| 272 | |||
| 273 | // use multiply accumulate/subtract to get result | ||
| 274 | tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); | ||
| 275 | tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); | ||
| 276 | |||
| 277 | accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); | ||
| 278 | accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); | ||
| 279 | |||
| 280 | // increment pointers | ||
| 281 | a_ptr += 4; | ||
| 282 | b_ptr += 4; | ||
| 283 | } | ||
| 284 | lv_32fc_t accum_result[4]; | ||
| 285 | vst2q_f32((float*)accum_result, accumulator); | ||
| 286 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
| 287 | |||
| 288 | // tail case | ||
| 289 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
| 290 | *result += (*a_ptr++) * lv_conj(*b_ptr++); | ||
| 291 | } | ||
| 292 | *result = lv_conj(*result); | ||
| 293 | } | ||
| 294 | #endif /*LV_HAVE_NEON*/ | ||
| 295 | |||
| 296 | #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ | ||
| 297 | |||
| 298 | #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H | ||
| 299 | #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H | ||
| 300 | |||
| 301 | #include <stdio.h> | ||
| 302 | #include <volk/volk_common.h> | ||
| 303 | #include <volk/volk_complex.h> | ||
| 304 | |||
| 305 | |||
| 306 | #ifdef LV_HAVE_AVX | ||
| 307 | #include <immintrin.h> | ||
| 308 | |||
| 309 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
| 310 | const lv_32fc_t* input, | ||
| 311 | const lv_32fc_t* taps, | ||
| 312 | unsigned int num_points) | ||
| 313 | { | ||
| 314 | // Partial sums for indices i, i+1, i+2 and i+3. | ||
| 315 | 2 | __m256 sum_a_mult_b_real = _mm256_setzero_ps(); | |
| 316 | 2 | __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); | |
| 317 | |||
| 318 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { |
| 319 | /* Four complex elements a time are processed. | ||
| 320 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
| 321 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
| 322 | */ | ||
| 323 | |||
| 324 | /* Load input and taps, split and duplicate real und imaginary parts of taps. | ||
| 325 | * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
| 326 | * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
| 327 | * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
| 328 | * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
| 329 | */ | ||
| 330 | 65534 | __m256 a = _mm256_load_ps((const float*)&input[i]); | |
| 331 | 131068 | __m256 b = _mm256_load_ps((const float*)&taps[i]); | |
| 332 | 65534 | __m256 b_real = _mm256_moveldup_ps(b); | |
| 333 | 65534 | __m256 b_imag = _mm256_movehdup_ps(b); | |
| 334 | |||
| 335 | // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
| 336 | 131068 | sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); | |
| 337 | // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
| 338 | 131068 | sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); | |
| 339 | } | ||
| 340 | |||
| 341 | // Swap position of −ar⋅bi and ai⋅bi. | ||
| 342 | 2 | sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
| 343 | // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. | ||
| 344 | 2 | __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
| 345 | /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. | ||
| 346 | * s1 + s3 and s0 + s2 … | ||
| 347 | */ | ||
| 348 | 2 | sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); | |
| 349 | // … and now (s0 + s2) + (s1 + s3) | ||
| 350 | 2 | sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
| 351 | // Store result. | ||
| 352 | 2 | __m128 lower = _mm256_extractf128_ps(sum, 0); | |
| 353 | _mm_storel_pi((__m64*)result, lower); | ||
| 354 | |||
| 355 | // Handle the last elements if num_points mod 4 is bigger than 0. | ||
| 356 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (long unsigned i = num_points & ~3u; i < num_points; ++i) { |
| 357 | 6 | *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + | |
| 358 | lv_cimag(input[i]) * lv_cimag(taps[i]), | ||
| 359 | lv_cimag(input[i]) * lv_creal(taps[i]) - | ||
| 360 | lv_creal(input[i]) * lv_cimag(taps[i])); | ||
| 361 | } | ||
| 362 | 2 | } | |
| 363 | #endif /* LV_HAVE_AVX */ | ||
| 364 | |||
| 365 | #ifdef LV_HAVE_SSE3 | ||
| 366 | |||
| 367 | #include <pmmintrin.h> | ||
| 368 | #include <xmmintrin.h> | ||
| 369 | |||
| 370 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result, | |
| 371 | const lv_32fc_t* input, | ||
| 372 | const lv_32fc_t* taps, | ||
| 373 | unsigned int num_points) | ||
| 374 | { | ||
| 375 | // Partial sums for indices i and i+1. | ||
| 376 | 2 | __m128 sum_a_mult_b_real = _mm_setzero_ps(); | |
| 377 | 2 | __m128 sum_a_mult_b_imag = _mm_setzero_ps(); | |
| 378 | |||
| 379 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { |
| 380 | /* Two complex elements a time are processed. | ||
| 381 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
| 382 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
| 383 | */ | ||
| 384 | |||
| 385 | /* Load input and taps, split and duplicate real und imaginary parts of taps. | ||
| 386 | * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
| 387 | * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
| 388 | * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
| 389 | * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
| 390 | */ | ||
| 391 | 131070 | __m128 a = _mm_load_ps((const float*)&input[i]); | |
| 392 | 262140 | __m128 b = _mm_load_ps((const float*)&taps[i]); | |
| 393 | 131070 | __m128 b_real = _mm_moveldup_ps(b); | |
| 394 | 131070 | __m128 b_imag = _mm_movehdup_ps(b); | |
| 395 | |||
| 396 | // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
| 397 | 262140 | sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); | |
| 398 | // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
| 399 | 262140 | sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); | |
| 400 | } | ||
| 401 | |||
| 402 | // Swap position of −ar⋅bi and ai⋅bi. | ||
| 403 | sum_a_mult_b_imag = | ||
| 404 | 2 | _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
| 405 | // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. | ||
| 406 | 2 | __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
| 407 | // Sum the two partial sums. | ||
| 408 | 4 | sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
| 409 | // Store result. | ||
| 410 | _mm_storel_pi((__m64*)result, sum); | ||
| 411 | |||
| 412 | // Handle the last element if num_points mod 2 is 1. | ||
| 413 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_points & 1u) { |
| 414 | 2 | *result += lv_cmake( | |
| 415 | lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + | ||
| 416 | lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), | ||
| 417 | lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - | ||
| 418 | lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); | ||
| 419 | } | ||
| 420 | 2 | } | |
| 421 | |||
| 422 | #endif /*LV_HAVE_SSE3*/ | ||
| 423 | |||
| 424 | |||
| 425 | #ifdef LV_HAVE_GENERIC | ||
| 426 | |||
| 427 | |||
| 428 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, | |
| 429 | const lv_32fc_t* input, | ||
| 430 | const lv_32fc_t* taps, | ||
| 431 | unsigned int num_points) | ||
| 432 | { | ||
| 433 | |||
| 434 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 435 | |||
| 436 | 2 | float* res = (float*)result; | |
| 437 | 2 | float* in = (float*)input; | |
| 438 | 2 | float* tp = (float*)taps; | |
| 439 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
| 440 | |||
| 441 | 2 | float sum0[2] = { 0, 0 }; | |
| 442 | 2 | float sum1[2] = { 0, 0 }; | |
| 443 | 2 | unsigned int i = 0; | |
| 444 | |||
| 445 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
| 446 | 131070 | sum0[0] += in[0] * tp[0] + in[1] * tp[1]; | |
| 447 | 131070 | sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; | |
| 448 | 131070 | sum1[0] += in[2] * tp[2] + in[3] * tp[3]; | |
| 449 | 131070 | sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; | |
| 450 | |||
| 451 | 131070 | in += 4; | |
| 452 | 131070 | tp += 4; | |
| 453 | } | ||
| 454 | |||
| 455 | 2 | res[0] = sum0[0] + sum1[0]; | |
| 456 | 2 | res[1] = sum0[1] + sum1[1]; | |
| 457 | |||
| 458 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 3 & 1) { |
| 459 | 2 | *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); | |
| 460 | } | ||
| 461 | 2 | } | |
| 462 | |||
| 463 | #endif /*LV_HAVE_GENERIC*/ | ||
| 464 | |||
| 465 | |||
| 466 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
| 467 | |||
| 468 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, | |
| 469 | const lv_32fc_t* input, | ||
| 470 | const lv_32fc_t* taps, | ||
| 471 | unsigned int num_points) | ||
| 472 | { | ||
| 473 | |||
| 474 | 2 | const unsigned int num_bytes = num_points * 8; | |
| 475 | |||
| 476 | __VOLK_ATTR_ALIGNED(16) | ||
| 477 | static const uint32_t conjugator[4] = { | ||
| 478 | 0x00000000, 0x80000000, 0x00000000, 0x80000000 | ||
| 479 | }; | ||
| 480 | |||
| 481 | 2 | __VOLK_ASM __VOLK_VOLATILE( | |
| 482 | "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" | ||
| 483 | "# const float *taps, unsigned num_bytes)\n\t" | ||
| 484 | "# float sum0 = 0;\n\t" | ||
| 485 | "# float sum1 = 0;\n\t" | ||
| 486 | "# float sum2 = 0;\n\t" | ||
| 487 | "# float sum3 = 0;\n\t" | ||
| 488 | "# do {\n\t" | ||
| 489 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
| 490 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
| 491 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
| 492 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
| 493 | "# input += 4;\n\t" | ||
| 494 | "# taps += 4; \n\t" | ||
| 495 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
| 496 | "# result[0] = sum0 + sum2;\n\t" | ||
| 497 | "# result[1] = sum1 + sum3;\n\t" | ||
| 498 | "# TODO: prefetch and better scheduling\n\t" | ||
| 499 | " xor %%r9, %%r9\n\t" | ||
| 500 | " xor %%r10, %%r10\n\t" | ||
| 501 | " movq %[conjugator], %%r9\n\t" | ||
| 502 | " movq %%rcx, %%rax\n\t" | ||
| 503 | " movaps 0(%%r9), %%xmm8\n\t" | ||
| 504 | " movq %%rcx, %%r8\n\t" | ||
| 505 | " movq %[rsi], %%r9\n\t" | ||
| 506 | " movq %[rdx], %%r10\n\t" | ||
| 507 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
| 508 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
| 509 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
| 510 | " shr $4, %%r8\n\t" | ||
| 511 | " xorps %%xmm8, %%xmm2\n\t" | ||
| 512 | " jmp .%=L1_test\n\t" | ||
| 513 | " # 4 taps / loop\n\t" | ||
| 514 | " # something like ?? cycles / loop\n\t" | ||
| 515 | ".%=Loop1: \n\t" | ||
| 516 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
| 517 | "# movaps (%%r9), %%xmmA\n\t" | ||
| 518 | "# movaps (%%r10), %%xmmB\n\t" | ||
| 519 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 520 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
| 521 | "# mulps %%xmmB, %%xmmA\n\t" | ||
| 522 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
| 523 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
| 524 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
| 525 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 526 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
| 527 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
| 528 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
| 529 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
| 530 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
| 531 | "# addps %%xmmZ, %%xmmA\n\t" | ||
| 532 | "# addps %%xmmA, %%xmmC\n\t" | ||
| 533 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
| 534 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
| 535 | " movaps 0(%%r9), %%xmm0\n\t" | ||
| 536 | " movaps 16(%%r9), %%xmm1\n\t" | ||
| 537 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 538 | " movaps 0(%%r10), %%xmm2\n\t" | ||
| 539 | " xorps %%xmm8, %%xmm2\n\t" | ||
| 540 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 541 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 542 | " movaps 16(%%r10), %%xmm3\n\t" | ||
| 543 | " movaps %%xmm1, %%xmm5\n\t" | ||
| 544 | " xorps %%xmm8, %%xmm3\n\t" | ||
| 545 | " addps %%xmm0, %%xmm6\n\t" | ||
| 546 | " mulps %%xmm3, %%xmm1\n\t" | ||
| 547 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
| 548 | " addps %%xmm1, %%xmm6\n\t" | ||
| 549 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 550 | " addps %%xmm2, %%xmm7\n\t" | ||
| 551 | " mulps %%xmm5, %%xmm3\n\t" | ||
| 552 | " add $32, %%r9\n\t" | ||
| 553 | " addps %%xmm3, %%xmm7\n\t" | ||
| 554 | " add $32, %%r10\n\t" | ||
| 555 | ".%=L1_test:\n\t" | ||
| 556 | " dec %%rax\n\t" | ||
| 557 | " jge .%=Loop1\n\t" | ||
| 558 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
| 559 | " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" | ||
| 560 | " # If so, we've got 2 more taps to do.\n\t" | ||
| 561 | " and $1, %%r8\n\t" | ||
| 562 | " je .%=Leven\n\t" | ||
| 563 | " # The count was odd, do 2 more taps.\n\t" | ||
| 564 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
| 565 | " # from the main loop.\n\t" | ||
| 566 | " movaps 0(%%r9), %%xmm0\n\t" | ||
| 567 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 568 | " movaps 0(%%r10), %%xmm2\n\t" | ||
| 569 | " xorps %%xmm8, %%xmm2\n\t" | ||
| 570 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 571 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 572 | " addps %%xmm0, %%xmm6\n\t" | ||
| 573 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 574 | " addps %%xmm2, %%xmm7\n\t" | ||
| 575 | ".%=Leven:\n\t" | ||
| 576 | " # neg inversor\n\t" | ||
| 577 | " xorps %%xmm1, %%xmm1\n\t" | ||
| 578 | " mov $0x80000000, %%r9\n\t" | ||
| 579 | " movd %%r9, %%xmm1\n\t" | ||
| 580 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
| 581 | " # pfpnacc\n\t" | ||
| 582 | " xorps %%xmm1, %%xmm6\n\t" | ||
| 583 | " movaps %%xmm6, %%xmm2\n\t" | ||
| 584 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
| 585 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
| 586 | " movaps %%xmm2, %%xmm3\n\t" | ||
| 587 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
| 588 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
| 589 | " addps %%xmm2, %%xmm6\n\t" | ||
| 590 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
| 591 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
| 592 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
| 593 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
| 594 | "to memory\n\t" | ||
| 595 | : | ||
| 596 | : [rsi] "r"(input), | ||
| 597 | [rdx] "r"(taps), | ||
| 598 | "c"(num_bytes), | ||
| 599 | [rdi] "r"(result), | ||
| 600 | [conjugator] "r"(conjugator) | ||
| 601 | : "rax", "r8", "r9", "r10"); | ||
| 602 | |||
| 603 | 2 | int getem = num_bytes % 16; | |
| 604 | |||
| 605 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; getem > 0; getem -= 8) { |
| 606 | 2 | *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); | |
| 607 | } | ||
| 608 | 2 | } | |
| 609 | #endif | ||
| 610 | |||
| 611 | #if LV_HAVE_SSE && LV_HAVE_32 | ||
| 612 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, | ||
| 613 | const lv_32fc_t* input, | ||
| 614 | const lv_32fc_t* taps, | ||
| 615 | unsigned int num_points) | ||
| 616 | { | ||
| 617 | |||
| 618 | const unsigned int num_bytes = num_points * 8; | ||
| 619 | |||
| 620 | __VOLK_ATTR_ALIGNED(16) | ||
| 621 | static const uint32_t conjugator[4] = { | ||
| 622 | 0x00000000, 0x80000000, 0x00000000, 0x80000000 | ||
| 623 | }; | ||
| 624 | |||
| 625 | int bound = num_bytes >> 4; | ||
| 626 | int leftovers = num_bytes % 16; | ||
| 627 | |||
| 628 | __VOLK_ASM __VOLK_VOLATILE( | ||
| 629 | " #pushl %%ebp\n\t" | ||
| 630 | " #movl %%esp, %%ebp\n\t" | ||
| 631 | " #movl 12(%%ebp), %%eax # input\n\t" | ||
| 632 | " #movl 16(%%ebp), %%edx # taps\n\t" | ||
| 633 | " #movl 20(%%ebp), %%ecx # n_bytes\n\t" | ||
| 634 | " movaps 0(%[conjugator]), %%xmm1\n\t" | ||
| 635 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
| 636 | " movaps 0(%[eax]), %%xmm0\n\t" | ||
| 637 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
| 638 | " movaps 0(%[edx]), %%xmm2\n\t" | ||
| 639 | " movl %[ecx], (%[out])\n\t" | ||
| 640 | " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" | ||
| 641 | |||
| 642 | " xorps %%xmm1, %%xmm2\n\t" | ||
| 643 | " jmp .%=L1_test\n\t" | ||
| 644 | " # 4 taps / loop\n\t" | ||
| 645 | " # something like ?? cycles / loop\n\t" | ||
| 646 | ".%=Loop1: \n\t" | ||
| 647 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
| 648 | "# movaps (%[eax]), %%xmmA\n\t" | ||
| 649 | "# movaps (%[edx]), %%xmmB\n\t" | ||
| 650 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 651 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
| 652 | "# mulps %%xmmB, %%xmmA\n\t" | ||
| 653 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
| 654 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
| 655 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
| 656 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
| 657 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
| 658 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
| 659 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
| 660 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
| 661 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
| 662 | "# addps %%xmmZ, %%xmmA\n\t" | ||
| 663 | "# addps %%xmmA, %%xmmC\n\t" | ||
| 664 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
| 665 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
| 666 | " movaps 16(%[edx]), %%xmm3\n\t" | ||
| 667 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 668 | " xorps %%xmm1, %%xmm3\n\t" | ||
| 669 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 670 | " movaps 16(%[eax]), %%xmm1\n\t" | ||
| 671 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 672 | " movaps %%xmm1, %%xmm5\n\t" | ||
| 673 | " addps %%xmm0, %%xmm6\n\t" | ||
| 674 | " mulps %%xmm3, %%xmm1\n\t" | ||
| 675 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
| 676 | " addps %%xmm1, %%xmm6\n\t" | ||
| 677 | " movaps 0(%[conjugator]), %%xmm1\n\t" | ||
| 678 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 679 | " movaps 32(%[eax]), %%xmm0\n\t" | ||
| 680 | " addps %%xmm2, %%xmm7\n\t" | ||
| 681 | " mulps %%xmm5, %%xmm3\n\t" | ||
| 682 | " addl $32, %[eax]\n\t" | ||
| 683 | " movaps 32(%[edx]), %%xmm2\n\t" | ||
| 684 | " addps %%xmm3, %%xmm7\n\t" | ||
| 685 | " xorps %%xmm1, %%xmm2\n\t" | ||
| 686 | " addl $32, %[edx]\n\t" | ||
| 687 | ".%=L1_test:\n\t" | ||
| 688 | " decl %[ecx]\n\t" | ||
| 689 | " jge .%=Loop1\n\t" | ||
| 690 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
| 691 | " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" | ||
| 692 | " # If so, we've got 2 more taps to do.\n\t" | ||
| 693 | " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" | ||
| 694 | " shrl $4, %[ecx]\n\t" | ||
| 695 | " andl $1, %[ecx]\n\t" | ||
| 696 | " je .%=Leven\n\t" | ||
| 697 | " # The count was odd, do 2 more taps.\n\t" | ||
| 698 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
| 699 | " # from the main loop.\n\t" | ||
| 700 | " movaps %%xmm0, %%xmm4\n\t" | ||
| 701 | " mulps %%xmm2, %%xmm0\n\t" | ||
| 702 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
| 703 | " addps %%xmm0, %%xmm6\n\t" | ||
| 704 | " mulps %%xmm4, %%xmm2\n\t" | ||
| 705 | " addps %%xmm2, %%xmm7\n\t" | ||
| 706 | ".%=Leven:\n\t" | ||
| 707 | " # neg inversor\n\t" | ||
| 708 | " #movl 8(%%ebp), %[eax] \n\t" | ||
| 709 | " xorps %%xmm1, %%xmm1\n\t" | ||
| 710 | " movl $0x80000000, (%[out])\n\t" | ||
| 711 | " movss (%[out]), %%xmm1\n\t" | ||
| 712 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
| 713 | " # pfpnacc\n\t" | ||
| 714 | " xorps %%xmm1, %%xmm6\n\t" | ||
| 715 | " movaps %%xmm6, %%xmm2\n\t" | ||
| 716 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
| 717 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
| 718 | " movaps %%xmm2, %%xmm3\n\t" | ||
| 719 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
| 720 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
| 721 | " addps %%xmm2, %%xmm6\n\t" | ||
| 722 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
| 723 | " #movl 8(%%ebp), %[eax] # @result\n\t" | ||
| 724 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
| 725 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
| 726 | " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " | ||
| 727 | "to memory\n\t" | ||
| 728 | " #popl %%ebp\n\t" | ||
| 729 | : | ||
| 730 | : [eax] "r"(input), | ||
| 731 | [edx] "r"(taps), | ||
| 732 | [ecx] "r"(num_bytes), | ||
| 733 | [out] "r"(result), | ||
| 734 | [conjugator] "r"(conjugator)); | ||
| 735 | |||
| 736 | for (; leftovers > 0; leftovers -= 8) { | ||
| 737 | *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); | ||
| 738 | } | ||
| 739 | } | ||
| 740 | #endif /*LV_HAVE_SSE*/ | ||
| 741 | |||
| 742 | |||
| 743 | #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ | ||
| 744 |