GCC Code Coverage Report

Directory:	./
File:	include/volk/volk_sse_intrinsics.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	43	43	100.0%
Functions:	5	5	100.0%
Branches:	0	0	-%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2015 Free Software Foundation, Inc.
    
       * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*
    
       * This file is intended to hold SSE intrinsics of intrinsics.
    
       * They should be used in VOLK kernels to avoid copy-pasta.
    
       */
    
      #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
    
      #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
    
      #include <xmmintrin.h>
    
      /*
    
       * Approximate arctan(x) via polynomial expansion
    
       * on the interval [-1, 1]
    
       *
    
       * Maximum relative error ~6.5e-7
    
       * Polynomial evaluated via Horner's method
    
       */
    
      131068
      static inline __m128 _mm_arctan_poly_sse(const __m128 x)
    
      {
    
      131068
          const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
    
      131068
          const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
    
      131068
          const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
    
      131068
          const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
    
      131068
          const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
    
      131068
          const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
    
      131068
          const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
    
      131068
          const __m128 x_times_x = _mm_mul_ps(x, x);
    
          __m128 arctan;
    
      131068
          arctan = a13;
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a11);
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a9);
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a7);
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a5);
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a3);
    
      131068
          arctan = _mm_mul_ps(x_times_x, arctan);
    
      131068
          arctan = _mm_add_ps(arctan, a1);
    
      131068
          arctan = _mm_mul_ps(x, arctan);
    
      131068
          return arctan;
    
      }
    
      393204
      static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
    
      {
    
          __m128 iValue, qValue;
    
          // Arrange in i1i2i3i4 format
    
      393204
          iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    
          // Arrange in q1q2q3q4 format
    
      393204
          qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    
      393204
          iValue = _mm_mul_ps(iValue, iValue); // Square the I values
    
      393204
          qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
    
      393204
          return _mm_add_ps(iValue, qValue);   // Add the I2 and Q2 values
    
      }
    
      131068
      static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
    
      {
    
      262136
          return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
    
      }
    
      131068
      static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
    
                                                       const __m128 symbols1,
    
                                                       const __m128 points0,
    
                                                       const __m128 points1,
    
                                                       const __m128 scalar)
    
      {
    
          // calculate scalar * |x - y|^2
    
      131068
          const __m128 diff0 = _mm_sub_ps(symbols0, points0);
    
      131068
          const __m128 diff1 = _mm_sub_ps(symbols1, points1);
    
      131068
          const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
    
      131068
          return _mm_mul_ps(norms, scalar);
    
      }
    
      131056
      static inline __m128 _mm_accumulate_square_sum_ps(
    
          __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
    
      {
    
      131056
          aux = _mm_mul_ps(aux, val);
    
      131056
          aux = _mm_sub_ps(aux, acc);
    
      131056
          aux = _mm_mul_ps(aux, aux);
    
      131056
          aux = _mm_mul_ps(aux, rec);
    
      131056
          return _mm_add_ps(sq_acc, aux);
    
      }
    
      #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */

Line	Exec	Source
1		/* -*- c++ -*- */
2		/*
3		* Copyright 2015 Free Software Foundation, Inc.
4		* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5		*
6		* This file is part of VOLK
7		*
8		* SPDX-License-Identifier: LGPL-3.0-or-later
9		*/
10
11		/*
12		* This file is intended to hold SSE intrinsics of intrinsics.
13		* They should be used in VOLK kernels to avoid copy-pasta.
14		*/
15
16		#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17		#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18		#include <xmmintrin.h>
19
20		/*
21		* Approximate arctan(x) via polynomial expansion
22		* on the interval [-1, 1]
23		*
24		* Maximum relative error ~6.5e-7
25		* Polynomial evaluated via Horner's method
26		*/
27	131068	static inline __m128 _mm_arctan_poly_sse(const __m128 x)
28		{
29	131068	const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
30	131068	const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
31	131068	const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
32	131068	const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
33	131068	const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
34	131068	const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
35	131068	const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
36
37	131068	const __m128 x_times_x = _mm_mul_ps(x, x);
38		__m128 arctan;
39	131068	arctan = a13;
40	131068	arctan = _mm_mul_ps(x_times_x, arctan);
41	131068	arctan = _mm_add_ps(arctan, a11);
42	131068	arctan = _mm_mul_ps(x_times_x, arctan);
43	131068	arctan = _mm_add_ps(arctan, a9);
44	131068	arctan = _mm_mul_ps(x_times_x, arctan);
45	131068	arctan = _mm_add_ps(arctan, a7);
46	131068	arctan = _mm_mul_ps(x_times_x, arctan);
47	131068	arctan = _mm_add_ps(arctan, a5);
48	131068	arctan = _mm_mul_ps(x_times_x, arctan);
49	131068	arctan = _mm_add_ps(arctan, a3);
50	131068	arctan = _mm_mul_ps(x_times_x, arctan);
51	131068	arctan = _mm_add_ps(arctan, a1);
52	131068	arctan = _mm_mul_ps(x, arctan);
53
54	131068	return arctan;
55		}
56
57	393204	static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
58		{
59		__m128 iValue, qValue;
60		// Arrange in i1i2i3i4 format
61	393204	iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
62		// Arrange in q1q2q3q4 format
63	393204	qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
64	393204	iValue = _mm_mul_ps(iValue, iValue); // Square the I values
65	393204	qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
66	393204	return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
67		}
68
69	131068	static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
70		{
71	262136	return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
72		}
73
74	131068	static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
75		const __m128 symbols1,
76		const __m128 points0,
77		const __m128 points1,
78		const __m128 scalar)
79		{
80		// calculate scalar * |x - y|^2
81	131068	const __m128 diff0 = _mm_sub_ps(symbols0, points0);
82	131068	const __m128 diff1 = _mm_sub_ps(symbols1, points1);
83	131068	const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
84	131068	return _mm_mul_ps(norms, scalar);
85		}
86
87	131056	static inline __m128 _mm_accumulate_square_sum_ps(
88		__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
89		{
90	131056	aux = _mm_mul_ps(aux, val);
91	131056	aux = _mm_sub_ps(aux, acc);
92	131056	aux = _mm_mul_ps(aux, aux);
93	131056	aux = _mm_mul_ps(aux, rec);
94	131056	return _mm_add_ps(sq_acc, aux);
95		}
96
97		#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
98