Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> | ||
5 | * | ||
6 | * This file is part of VOLK | ||
7 | * | ||
8 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
9 | */ | ||
10 | |||
11 | /*! | ||
12 | * \page volk_32f_atan_32f | ||
13 | * | ||
14 | * \b Overview | ||
15 | * | ||
16 | * Computes arctangent of input vector and stores results in output vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points) | ||
21 | * \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: The input vector of floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li bVector: The vector where results will be stored. | ||
29 | * | ||
30 | * \b Example | ||
31 | * Calculate common angles around the top half of the unit circle. | ||
32 | * \code | ||
33 | * int N = 10; | ||
34 | * unsigned int alignment = volk_get_alignment(); | ||
35 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | * | ||
38 | * in[0] = 0.f; | ||
39 | * in[1] = 1.f/std::sqrt(3.f); | ||
40 | * in[2] = 1.f; | ||
41 | * in[3] = std::sqrt(3.f); | ||
42 | * in[4] = in[5] = 1e99; | ||
43 | * for(unsigned int ii = 6; ii < N; ++ii){ | ||
44 | * in[ii] = - in[N-ii-1]; | ||
45 | * } | ||
46 | * | ||
47 | * volk_32f_atan_32f(out, in, N); | ||
48 | * | ||
49 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
50 | * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]); | ||
51 | * } | ||
52 | * | ||
53 | * volk_free(in); | ||
54 | * volk_free(out); | ||
55 | * \endcode | ||
56 | */ | ||
57 | #include <math.h> | ||
58 | |||
59 | #ifndef INCLUDED_volk_32f_atan_32f_a_H | ||
60 | #define INCLUDED_volk_32f_atan_32f_a_H | ||
61 | |||
62 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
63 | #include <immintrin.h> | ||
64 | #include <volk/volk_avx2_fma_intrinsics.h> | ||
static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    /* Aligned AVX2+FMA arctangent kernel (8 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _m256_arctan_poly_avx2_fma is evaluated.
     */
    const __m256 one = _mm256_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
95 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
96 | |||
97 | #if LV_HAVE_AVX | ||
98 | #include <immintrin.h> | ||
99 | #include <volk/volk_avx_intrinsics.h> | ||
100 | static inline void | ||
101 | 2 | volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) | |
102 | { | ||
103 | 2 | const __m256 one = _mm256_set1_ps(1.f); | |
104 | 2 | const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); | |
105 | 4 | const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
106 | 2 | const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
107 | |||
108 | 2 | unsigned int number = 0; | |
109 | 2 | unsigned int eighth_points = num_points / 8; | |
110 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighth_points; number++) { |
111 | 32766 | __m256 x = _mm256_load_ps(in); | |
112 | 32766 | __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); | |
113 | 65532 | __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), | |
114 | _mm256_blendv_ps(one, x, swap_mask)); | ||
115 | 32766 | __m256 result = _m256_arctan_poly_avx(x_star); | |
116 | 32766 | __m256 term = _mm256_and_ps(x_star, sign_mask); | |
117 | 32766 | term = _mm256_or_ps(pi_over_2, term); | |
118 | 32766 | term = _mm256_sub_ps(term, result); | |
119 | 32766 | result = _mm256_blendv_ps(result, term, swap_mask); | |
120 | _mm256_store_ps(out, result); | ||
121 | 32766 | in += 8; | |
122 | 32766 | out += 8; | |
123 | } | ||
124 | |||
125 | 2 | number = eighth_points * 8; | |
126 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
127 | 14 | *out++ = volk_arctan(*in++); | |
128 | } | ||
129 | 2 | } | |
130 | #endif /* LV_HAVE_AVX for aligned */ | ||
131 | |||
132 | #ifdef LV_HAVE_SSE4_1 | ||
133 | #include <smmintrin.h> | ||
134 | #include <volk/volk_sse_intrinsics.h> | ||
135 | static inline void | ||
136 | 2 | volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) | |
137 | { | ||
138 | 2 | const __m128 one = _mm_set1_ps(1.f); | |
139 | 2 | const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f); | |
140 | 4 | const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); | |
141 | 2 | const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); | |
142 | |||
143 | 2 | unsigned int number = 0; | |
144 | 2 | unsigned int quarter_points = num_points / 4; | |
145 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarter_points; number++) { |
146 | 65534 | __m128 x = _mm_load_ps(in); | |
147 | 131068 | __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); | |
148 | 131068 | __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), | |
149 | _mm_blendv_ps(one, x, swap_mask)); | ||
150 | 65534 | __m128 result = _mm_arctan_poly_sse(x_star); | |
151 | 65534 | __m128 term = _mm_and_ps(x_star, sign_mask); | |
152 | 65534 | term = _mm_or_ps(pi_over_2, term); | |
153 | 65534 | term = _mm_sub_ps(term, result); | |
154 | 65534 | result = _mm_blendv_ps(result, term, swap_mask); | |
155 | _mm_store_ps(out, result); | ||
156 | 65534 | in += 4; | |
157 | 65534 | out += 4; | |
158 | } | ||
159 | |||
160 | 2 | number = quarter_points * 4; | |
161 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
162 | 6 | *out++ = volk_arctan(*in++); | |
163 | } | ||
164 | 2 | } | |
165 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
166 | #endif /* INCLUDED_volk_32f_atan_32f_a_H */ | ||
167 | |||
168 | #ifndef INCLUDED_volk_32f_atan_32f_u_H | ||
169 | #define INCLUDED_volk_32f_atan_32f_u_H | ||
170 | |||
171 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
172 | #include <immintrin.h> | ||
173 | static inline void | ||
174 | 2 | volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points) | |
175 | { | ||
176 | 2 | const __m256 one = _mm256_set1_ps(1.f); | |
177 | 2 | const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); | |
178 | 4 | const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
179 | 2 | const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
180 | |||
181 | 2 | unsigned int number = 0; | |
182 | 2 | unsigned int eighth_points = num_points / 8; | |
183 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighth_points; number++) { |
184 | 32766 | __m256 x = _mm256_loadu_ps(in); | |
185 | 32766 | __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); | |
186 | 65532 | __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), | |
187 | _mm256_blendv_ps(one, x, swap_mask)); | ||
188 | 32766 | __m256 result = _m256_arctan_poly_avx2_fma(x_star); | |
189 | 32766 | __m256 term = _mm256_and_ps(x_star, sign_mask); | |
190 | 32766 | term = _mm256_or_ps(pi_over_2, term); | |
191 | 32766 | term = _mm256_sub_ps(term, result); | |
192 | 32766 | result = _mm256_blendv_ps(result, term, swap_mask); | |
193 | _mm256_storeu_ps(out, result); | ||
194 | 32766 | in += 8; | |
195 | 32766 | out += 8; | |
196 | } | ||
197 | |||
198 | 2 | number = eighth_points * 8; | |
199 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
200 | 14 | *out++ = volk_arctan(*in++); | |
201 | } | ||
202 | 2 | } | |
203 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
204 | |||
205 | #if LV_HAVE_AVX | ||
206 | #include <immintrin.h> | ||
static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
    /* Unaligned AVX arctangent kernel (8 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _m256_arctan_poly_avx is evaluated.
     */
    const __m256 one = _mm256_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
237 | #endif /* LV_HAVE_AVX for unaligned */ | ||
238 | |||
239 | #ifdef LV_HAVE_SSE4_1 | ||
240 | #include <smmintrin.h> | ||
241 | #include <volk/volk_sse_intrinsics.h> | ||
static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
    /* Unaligned SSE4.1 arctangent kernel (4 floats per iteration).
     *
     * Range reduction: for |x| > 1 the identity
     *     atan(x) = sign(x) * pi/2 - atan(1/x)
     * folds the argument into [-1, 1], where the polynomial
     * approximation _mm_arctan_poly_sse is evaluated.
     */
    const __m128 one = _mm_set1_ps(1.f);
    /* pi/2 as a hex float literal (0x1.921fb6p0f == 1.5707964f) */
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 x = _mm_loadu_ps(in);
        /* lanes where |x| > 1 take the reciprocal path */
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        /* x_star = x when |x| <= 1, otherwise 1/x */
        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                   _mm_blendv_ps(one, x, swap_mask));
        __m128 result = _mm_arctan_poly_sse(x_star);
        /* term = sign(x_star) * pi/2 - atan(x_star), used for swapped lanes */
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_storeu_ps(out, result);
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining num_points % 4 elements. */
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
272 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
273 | |||
274 | #ifdef LV_HAVE_GENERIC | ||
275 | static inline void | ||
276 | 2 | volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) | |
277 | { | ||
278 | 2 | unsigned int number = 0; | |
279 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
280 | 262142 | *out++ = volk_arctan(*in++); | |
281 | } | ||
282 | 2 | } | |
283 | #endif /* LV_HAVE_GENERIC */ | ||
284 | |||
285 | #ifdef LV_HAVE_GENERIC | ||
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
    /* Reference implementation: libm atanf applied element-wise. */
    unsigned int idx;
    for (idx = 0; idx < num_points; ++idx) {
        out[idx] = atanf(in[idx]);
    }
}
294 | #endif /* LV_HAVE_GENERIC */ | ||
295 | |||
296 | #endif /* INCLUDED_volk_32f_atan_32f_u_H */ | ||
297 |