/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX intrinsics of intrinsics.
 * They should be used in VOLK kernels to avoid copy-pasta.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Polynomial evaluated via Horner's method
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x_times_x = _mm256_mul_ps(x, x);
    __m256 arctan;
    arctan = a13;
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a11);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a9);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a7);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a5);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a3);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a1);
    arctan = _mm256_mul_ps(x, arctan);

    return arctan;
}
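
/*
 * Usage sketch (illustrative; buffer names are hypothetical). The polynomial
 * is only valid on [-1, 1], so callers are expected to range-reduce first:
 *
 *   float in[8];  // arguments already reduced to [-1, 1]
 *   float out[8];
 *   __m256 x = _mm256_loadu_ps(in);
 *   __m256 y = _m256_arctan_poly_avx(x); // ~atan(x) per lane
 *   _mm256_storeu_ps(out, y);
 */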

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);        // Load yh with ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di

    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(tmp1, tmp2);
}
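
/*
 * Worked expansion (for reference): with each lane pair of x holding a + bi
 * and each lane pair of y holding c + di, the return value is the complex
 * product
 *   (a + bi)(c + di) = (a*c - b*d) + i(a*d + b*c)
 * computed for four complex values at once.
 */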

static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // flip the sign of each imaginary part
}
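
/*
 * Note: XOR-ing a float with -0.f flips only its IEEE-754 sign bit, so the
 * setr pattern above negates every odd (imaginary) lane while leaving the
 * even (real) lanes untouched, e.g. (3, 4) -> (3, -4).
 */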

static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1);
    const __m256 dreal = _mm256_moveldup_ps(y);
    const __m256 dimag = _mm256_movehdup_ps(y);

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
    const __m256 multreal = _mm256_mul_ps(x, dreal);
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
    return _mm256_add_ps(multreal, multimag);
}
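
/*
 * Worked expansion (for reference): with x = a + bi and y = c + di per lane
 * pair, the return value is x * conj(y):
 *   (a + bi)(c - di) = (a*c + b*d) + i(b*c - a*d)
 */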

static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val);
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1);
}
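
/*
 * Usage sketch (buffer name hypothetical): scales four interleaved complex
 * floats to unit magnitude, i.e. each (re, im) pair is divided by its own
 * sqrt(re^2 + im^2):
 *
 *   __m256 v = _mm256_loadu_ps(complex_in); // (ar, ai, br, bi, cr, ci, dr, di)
 *   __m256 u = _mm256_normalize_ps(v);      // each (re, im) pair now has |u| == 1
 */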

static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2); // Add the I^2 and Q^2 values
}
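
/*
 * Note on ordering (derived from the permutes above): the two inputs hold
 * eight interleaved complex floats, and the result packs their squared
 * magnitudes in original sample order. A hypothetical caller:
 *
 *   __m256 a = _mm256_loadu_ps(in);     // samples 0..3 as (re, im) pairs
 *   __m256 b = _mm256_loadu_ps(in + 8); // samples 4..7
 *   __m256 mag2 = _mm256_magnitudesquared_ps(a, b); // re^2+im^2, samples 0..7
 */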

static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}

static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
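
/*
 * Usage sketch (names hypothetical): a typical soft-decision use is scaling
 * the squared Euclidean distance between received symbols and constellation
 * points by the linear SNR:
 *
 *   __m256 snr = _mm256_set1_ps(snr_lin);
 *   __m256 d = _mm256_scaled_norm_dist_ps(sym0, sym1, pts0, pts1, snr);
 */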

static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    __m256 sign_mask_dummy = _mm256_setzero_ps();
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m128i shuffle_mask0 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
    const __m128i shuffle_mask1 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);

    __m256 sign_mask =
        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
    // This is the desired function call, though it seems to be missing in GCC.
    // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
    // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
    //                        _mm_castsi128_ps(sign_bits0));
}
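
/*
 * What this produces (derived from the shuffles above): each of the first
 * eight input bytes that is positive contributes 0x80000000 in the
 * corresponding 32-bit lane, i.e. a mask that flips exactly the float sign
 * bits selected by 'fbits' when XOR-ed onto a __m256. (The polar kernels
 * appear to pass 0/1 flag bytes here.)
 */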

static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // deinterleave values
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}
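
/*
 * Shuffle effect (for reference): with src0 = (s0 ... s7) and
 * src1 = (s8 ... s15), the outputs are the even and odd elements in order:
 *   *llr0 = (s0, s2, s4, s6, s8, s10, s12, s14)
 *   *llr1 = (s1, s3, s5, s7, s9, s11, s13, s15)
 */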

static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
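
/*
 * Per lane this computes the min-sum approximation of the LLR box-plus
 * operation: sign(llr0) * sign(llr1) * min(|llr0|, |llr1|), with the
 * deinterleaved even/odd elements as the two operands.
 */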

static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
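
/*
 * Per lane this computes (-1)^f * llr0 + llr1, with the f bits expanded from
 * 'fbits' by _mm256_polar_sign_mask(); this is the usual g-function step of
 * successive-cancellation polar decoding.
 */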

static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}
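
/*
 * Per lane this returns sq_acc + rec * (aux * val - acc)^2, i.e. it folds one
 * scaled squared deviation into a running accumulator. (In VOLK this pattern
 * serves streaming mean/variance style kernels; the caller maintains the
 * running sum 'acc' and the reciprocal weights 'rec'.)
 */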

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */