GCC Code Coverage Report


Directory: ./
File: include/volk/volk_sse_intrinsics.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 43 43 100.0%
Functions: 5 5 100.0%
Branches: 0 0 -%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11 /*
12 * This file is intended to hold helper functions built from SSE intrinsics.
13 * They should be used in VOLK kernels to avoid copy-pasta.
14 */
15
16 #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18 #include <xmmintrin.h>
19
20 /*
21 * Approximate arctan(x) via polynomial expansion
22 * on the interval [-1, 1]
23 *
24 * Maximum relative error ~6.5e-7
25 * Polynomial evaluated via Horner's method
26 */
27 131068 static inline __m128 _mm_arctan_poly_sse(const __m128 x)
28 {
29 131068 const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
30 131068 const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
31 131068 const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
32 131068 const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
33 131068 const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
34 131068 const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
35 131068 const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
36
37 131068 const __m128 x_times_x = _mm_mul_ps(x, x);
38 __m128 arctan;
39 131068 arctan = a13;
40 131068 arctan = _mm_mul_ps(x_times_x, arctan);
41 131068 arctan = _mm_add_ps(arctan, a11);
42 131068 arctan = _mm_mul_ps(x_times_x, arctan);
43 131068 arctan = _mm_add_ps(arctan, a9);
44 131068 arctan = _mm_mul_ps(x_times_x, arctan);
45 131068 arctan = _mm_add_ps(arctan, a7);
46 131068 arctan = _mm_mul_ps(x_times_x, arctan);
47 131068 arctan = _mm_add_ps(arctan, a5);
48 131068 arctan = _mm_mul_ps(x_times_x, arctan);
49 131068 arctan = _mm_add_ps(arctan, a3);
50 131068 arctan = _mm_mul_ps(x_times_x, arctan);
51 131068 arctan = _mm_add_ps(arctan, a1);
52 131068 arctan = _mm_mul_ps(x, arctan);
53
54 131068 return arctan;
55 }
56
57 393204 static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
58 {
59 __m128 iValue, qValue;
60 // Arrange in i1i2i3i4 format
61 393204 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
62 // Arrange in q1q2q3q4 format
63 393204 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
64 393204 iValue = _mm_mul_ps(iValue, iValue); // Square the I values
65 393204 qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
66 393204 return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
67 }
68
69 131068 static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
70 {
71 262136 return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
72 }
73
74 131068 static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
75 const __m128 symbols1,
76 const __m128 points0,
77 const __m128 points1,
78 const __m128 scalar)
79 {
80 // calculate scalar * |x - y|^2
81 131068 const __m128 diff0 = _mm_sub_ps(symbols0, points0);
82 131068 const __m128 diff1 = _mm_sub_ps(symbols1, points1);
83 131068 const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
84 131068 return _mm_mul_ps(norms, scalar);
85 }
86
87 131056 static inline __m128 _mm_accumulate_square_sum_ps(
88 __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
89 {
90 131056 aux = _mm_mul_ps(aux, val);
91 131056 aux = _mm_sub_ps(aux, acc);
92 131056 aux = _mm_mul_ps(aux, aux);
93 131056 aux = _mm_mul_ps(aux, rec);
94 131056 return _mm_add_ps(sq_acc, aux);
95 }
96
97 #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
98