Line |
Branch |
Exec |
Source |
1 |
|
|
/* -*- c++ -*- */ |
2 |
|
|
/* |
3 |
|
|
* Copyright 2015 Free Software Foundation, Inc. |
4 |
|
|
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> |
5 |
|
|
* |
6 |
|
|
* This file is part of VOLK |
7 |
|
|
* |
8 |
|
|
* SPDX-License-Identifier: LGPL-3.0-or-later |
9 |
|
|
*/ |
10 |
|
|
|
11 |
|
|
/* |
12 |
|
|
* This file is intended to hold helper functions composed of SSE intrinsics. |
13 |
|
|
* They should be used in VOLK kernels to avoid copy-pasta. |
14 |
|
|
*/ |
15 |
|
|
|
16 |
|
|
#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ |
17 |
|
|
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ |
18 |
|
|
#include <xmmintrin.h> |
19 |
|
|
|
20 |
|
|
/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The odd polynomial
 *     a1*x + a3*x^3 + a5*x^5 + ... + a13*x^13
 * is evaluated with Horner's method in x^2, followed by a single
 * multiplication by x.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients, listed from the highest power down, i.e. in the
     * order in which Horner's method consumes them. */
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);

    const __m128 x_sq = _mm_mul_ps(x, x);

    /* Each step: acc = x^2 * acc + next coefficient. */
    __m128 acc = _mm_add_ps(_mm_mul_ps(x_sq, c13), c11);
    acc = _mm_add_ps(_mm_mul_ps(x_sq, acc), c9);
    acc = _mm_add_ps(_mm_mul_ps(x_sq, acc), c7);
    acc = _mm_add_ps(_mm_mul_ps(x_sq, acc), c5);
    acc = _mm_add_ps(_mm_mul_ps(x_sq, acc), c3);
    acc = _mm_add_ps(_mm_mul_ps(x_sq, acc), c1);

    /* The remaining factor of x turns the even polynomial into the odd one. */
    return _mm_mul_ps(x, acc);
}
56 |
|
|
|
57 |
|
393204 |
/*
 * Squared magnitude of four interleaved complex floats.
 *
 * cplxValue1 and cplxValue2 each carry two complex samples laid out as
 * (i, q, i, q). The result holds i*i + q*q for the two samples of
 * cplxValue1 in the low two lanes, followed by the two samples of
 * cplxValue2 in the high two lanes.
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    // De-interleave: even lanes of both inputs -> i1 i2 i3 i4
    const __m128 in_phase =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    // De-interleave: odd lanes of both inputs -> q1 q2 q3 q4
    const __m128 quadrature =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    const __m128 in_phase_sq = _mm_mul_ps(in_phase, in_phase);
    const __m128 quadrature_sq = _mm_mul_ps(quadrature, quadrature);

    return _mm_add_ps(in_phase_sq, quadrature_sq); // i^2 + q^2 per sample
}
68 |
|
|
|
69 |
|
131068 |
static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) |
70 |
|
|
{ |
71 |
|
262136 |
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); |
72 |
|
|
} |
73 |
|
|
|
74 |
|
131068 |
static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, |
75 |
|
|
const __m128 symbols1, |
76 |
|
|
const __m128 points0, |
77 |
|
|
const __m128 points1, |
78 |
|
|
const __m128 scalar) |
79 |
|
|
{ |
80 |
|
|
// calculate scalar * |x - y|^2 |
81 |
|
131068 |
const __m128 diff0 = _mm_sub_ps(symbols0, points0); |
82 |
|
131068 |
const __m128 diff1 = _mm_sub_ps(symbols1, points1); |
83 |
|
131068 |
const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); |
84 |
|
131068 |
return _mm_mul_ps(norms, scalar); |
85 |
|
|
} |
86 |
|
|
|
87 |
|
131056 |
/*
 * Returns sq_acc + rec * (aux * val - acc)^2, computed lane-wise.
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    const __m128 scaled = _mm_mul_ps(aux, val);
    const __m128 delta = _mm_sub_ps(scaled, acc);
    const __m128 delta_sq = _mm_mul_ps(delta, delta);
    const __m128 term = _mm_mul_ps(delta_sq, rec);
    return _mm_add_ps(sq_acc, term);
}
96 |
|
|
|
97 |
|
|
#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */ |
98 |
|
|
|