Line |
Branch |
Exec |
Source |
1 |
|
|
/* -*- c++ -*- */ |
2 |
|
|
/* |
3 |
|
|
* Copyright 2015 Free Software Foundation, Inc. |
4 |
|
|
* |
5 |
|
|
* This file is part of VOLK |
6 |
|
|
* |
7 |
|
|
* SPDX-License-Identifier: LGPL-3.0-or-later |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
/* |
11 |
|
|
* This file is intended to hold helper functions composed of SSE3 intrinsics. |
12 |
|
|
* They should be used in VOLK kernels to avoid copy-pasta. |
13 |
|
|
*/ |
14 |
|
|
|
15 |
|
|
#ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ |
16 |
|
|
#define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ |
17 |
|
|
#include <pmmintrin.h> |
18 |
|
|
|
19 |
|
1048556 |
static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) |
20 |
|
|
{ |
21 |
|
|
__m128 yl, yh, tmp1, tmp2; |
22 |
|
1048556 |
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr |
23 |
|
1048556 |
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di |
24 |
|
1048556 |
tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |
25 |
|
1048556 |
x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br |
26 |
|
1048556 |
tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di |
27 |
|
1048556 |
return _mm_addsub_ps(tmp1, |
28 |
|
|
tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |
29 |
|
|
} |
30 |
|
|
|
31 |
|
786416 |
static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) |
32 |
|
|
{ |
33 |
|
786416 |
const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); |
34 |
|
786416 |
y = _mm_xor_ps(y, conjugator); // conjugate y |
35 |
|
786416 |
return _mm_complexmul_ps(x, y); |
36 |
|
|
} |
37 |
|
|
|
38 |
|
524272 |
static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) |
39 |
|
|
{ |
40 |
|
524272 |
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values |
41 |
|
524272 |
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values |
42 |
|
524272 |
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values |
43 |
|
|
} |
44 |
|
|
|
45 |
|
131068 |
static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) |
46 |
|
|
{ |
47 |
|
262136 |
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); |
48 |
|
|
} |
49 |
|
|
|
50 |
|
131068 |
static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, |
51 |
|
|
const __m128 symbols1, |
52 |
|
|
const __m128 points0, |
53 |
|
|
const __m128 points1, |
54 |
|
|
const __m128 scalar) |
55 |
|
|
{ |
56 |
|
|
/* |
57 |
|
|
* Calculate: |y - x|^2 * SNR_lin |
58 |
|
|
* Consider 'symbolsX' and 'pointsX' to be complex float |
59 |
|
|
* 'symbolsX' are 'y' and 'pointsX' are 'x' |
60 |
|
|
*/ |
61 |
|
131068 |
const __m128 diff0 = _mm_sub_ps(symbols0, points0); |
62 |
|
131068 |
const __m128 diff1 = _mm_sub_ps(symbols1, points1); |
63 |
|
131068 |
const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); |
64 |
|
131068 |
return _mm_mul_ps(norms, scalar); |
65 |
|
|
} |
66 |
|
|
|
67 |
|
|
#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ |
68 |
|
|
|