GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	163	168	97.0%
Functions:	7	7	100.0%
Branches:	15	18	83.3%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2017 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_s32f_s32f_mod_range_32f
    
       *
    
       * \b wraps floating point numbers to stay within a defined [min,max] range
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
    
       * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li inputVector: The input vector
    
       * \li lower_bound: The lower output boundary
    
       * \li upper_bound: The upper output boundary
    
       * \li num_points The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The vector where the results will be stored.
    
       *
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
    
      #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
    
      #ifdef LV_HAVE_GENERIC
    
      14
      static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
    
                                                                  const float* inputVector,
    
                                                                  const float lower_bound,
    
                                                                  const float upper_bound,
    
                                                                  unsigned int num_points)
    
      {
    
      14
          float* outPtr = outputVector;
    
          const float* inPtr;
    
      14
          const float distance = upper_bound - lower_bound;
    
        2/2✓ Branch 0 taken 262194 times.
✓ Branch 1 taken 14 times.

      262208
          for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
    
      262194
              float val = *inPtr;
    
        1/2✓ Branch 0 taken 262194 times.
✗ Branch 1 not taken.

      262194
              if (val < lower_bound) {
    
      262194
                  float excess = lower_bound - val;
    
      262194
                  signed int count = (int)(excess / distance);
    
      262194
                  *outPtr = val + (count + 1) * distance;
    
      ✗
              } else if (val > upper_bound) {
    
      ✗
                  float excess = val - upper_bound;
    
      ✗
                  signed int count = (int)(excess / distance);
    
      ✗
                  *outPtr = val - (count + 1) * distance;
    
              } else
    
      ✗
                  *outPtr = val;
    
      262194
              outPtr++;
    
          }
    
      14
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_AVX
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
    
                                                                const float* inputVector,
    
                                                                const float lower_bound,
    
                                                                const float upper_bound,
    
                                                                unsigned int num_points)
    
      {
    
      2
          const __m256 lower = _mm256_set1_ps(lower_bound);
    
      2
          const __m256 upper = _mm256_set1_ps(upper_bound);
    
      2
          const __m256 distance = _mm256_sub_ps(upper, lower);
    
          __m256 input, output;
    
          __m256 is_smaller, is_bigger;
    
          __m256 excess, adj;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t eight_points = num_points / 8;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (size_t counter = 0; counter < eight_points; counter++) {
    
      32766
              input = _mm256_loadu_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      32766
              is_smaller = _mm256_cmp_ps(
    
                  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
    
      32766
              is_bigger = _mm256_cmp_ps(
    
                  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
    
              // find out how far we are out-of-bound – positive values!
    
      65532
              excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
    
              excess =
    
      98298
                  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      32766
              excess = _mm256_div_ps(excess, distance);
    
              // round down
    
      65532
              excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
    
              // plus 1
    
      32766
              adj = _mm256_set1_ps(1.0f);
    
      32766
              excess = _mm256_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      32766
              adj = _mm256_and_ps(adj, is_smaller);
    
      98298
              adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      65532
              excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
    
      32766
              output = _mm256_add_ps(input, excess);
    
              _mm256_storeu_ps(outPtr, output);
    
      32766
              inPtr += 8;
    
      32766
              outPtr += 8;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
    
      2
      }
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
    
                                                                const float* inputVector,
    
                                                                const float lower_bound,
    
                                                                const float upper_bound,
    
                                                                unsigned int num_points)
    
      {
    
      2
          const __m256 lower = _mm256_set1_ps(lower_bound);
    
      2
          const __m256 upper = _mm256_set1_ps(upper_bound);
    
      2
          const __m256 distance = _mm256_sub_ps(upper, lower);
    
          __m256 input, output;
    
          __m256 is_smaller, is_bigger;
    
          __m256 excess, adj;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t eight_points = num_points / 8;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (size_t counter = 0; counter < eight_points; counter++) {
    
      32766
              input = _mm256_load_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      32766
              is_smaller = _mm256_cmp_ps(
    
                  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
    
      32766
              is_bigger = _mm256_cmp_ps(
    
                  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
    
              // find out how far we are out-of-bound – positive values!
    
      65532
              excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
    
              excess =
    
      98298
                  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      32766
              excess = _mm256_div_ps(excess, distance);
    
              // round down
    
      65532
              excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
    
              // plus 1
    
      32766
              adj = _mm256_set1_ps(1.0f);
    
      32766
              excess = _mm256_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      32766
              adj = _mm256_and_ps(adj, is_smaller);
    
      98298
              adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      65532
              excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
    
      32766
              output = _mm256_add_ps(input, excess);
    
              _mm256_store_ps(outPtr, output);
    
      32766
              inPtr += 8;
    
      32766
              outPtr += 8;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE2
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
    
                                                                 const float* inputVector,
    
                                                                 const float lower_bound,
    
                                                                 const float upper_bound,
    
                                                                 unsigned int num_points)
    
      {
    
      2
          const __m128 lower = _mm_set_ps1(lower_bound);
    
      2
          const __m128 upper = _mm_set_ps1(upper_bound);
    
      2
          const __m128 distance = _mm_sub_ps(upper, lower);
    
          __m128 input, output;
    
          __m128 is_smaller, is_bigger;
    
          __m128 excess, adj;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t quarter_points = num_points / 4;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (size_t counter = 0; counter < quarter_points; counter++) {
    
      65534
              input = _mm_loadu_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      65534
              is_smaller = _mm_cmplt_ps(input, lower);
    
      65534
              is_bigger = _mm_cmpgt_ps(input, upper);
    
              // find out how far we are out-of-bound – positive values!
    
      131068
              excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
    
      196602
              excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      65534
              excess = _mm_div_ps(excess, distance);
    
              // round down
    
      131068
              excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
    
              // plus 1
    
      65534
              adj = _mm_set_ps1(1.0f);
    
      65534
              excess = _mm_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      65534
              adj = _mm_and_ps(adj, is_smaller);
    
      196602
              adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      131068
              excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
    
      65534
              output = _mm_add_ps(input, excess);
    
              _mm_storeu_ps(outPtr, output);
    
      65534
              inPtr += 4;
    
      65534
              outPtr += 4;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
    
      2
      }
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
    
                                                                 const float* inputVector,
    
                                                                 const float lower_bound,
    
                                                                 const float upper_bound,
    
                                                                 unsigned int num_points)
    
      {
    
      2
          const __m128 lower = _mm_set_ps1(lower_bound);
    
      2
          const __m128 upper = _mm_set_ps1(upper_bound);
    
      2
          const __m128 distance = _mm_sub_ps(upper, lower);
    
          __m128 input, output;
    
          __m128 is_smaller, is_bigger;
    
          __m128 excess, adj;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t quarter_points = num_points / 4;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (size_t counter = 0; counter < quarter_points; counter++) {
    
      65534
              input = _mm_load_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      65534
              is_smaller = _mm_cmplt_ps(input, lower);
    
      65534
              is_bigger = _mm_cmpgt_ps(input, upper);
    
              // find out how far we are out-of-bound – positive values!
    
      131068
              excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
    
      196602
              excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      65534
              excess = _mm_div_ps(excess, distance);
    
              // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
    
              // conversion.
    
      131068
              excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
    
              // plus 1
    
      65534
              adj = _mm_set_ps1(1.0f);
    
      65534
              excess = _mm_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      65534
              adj = _mm_and_ps(adj, is_smaller);
    
      196602
              adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      131068
              excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
    
      65534
              output = _mm_add_ps(input, excess);
    
              _mm_store_ps(outPtr, output);
    
      65534
              inPtr += 4;
    
      65534
              outPtr += 4;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
    
      2
      }
    
      #endif /* LV_HAVE_SSE2 */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
    
                                                                const float* inputVector,
    
                                                                const float lower_bound,
    
                                                                const float upper_bound,
    
                                                                unsigned int num_points)
    
      {
    
      2
          const __m128 lower = _mm_set_ps1(lower_bound);
    
      2
          const __m128 upper = _mm_set_ps1(upper_bound);
    
      2
          const __m128 distance = _mm_sub_ps(upper, lower);
    
          __m128 input, output;
    
          __m128 is_smaller, is_bigger;
    
          __m128 excess, adj;
    
          __m128i rounddown;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t quarter_points = num_points / 4;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (size_t counter = 0; counter < quarter_points; counter++) {
    
      65534
              input = _mm_loadu_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      65534
              is_smaller = _mm_cmplt_ps(input, lower);
    
      65534
              is_bigger = _mm_cmpgt_ps(input, upper);
    
              // find out how far we are out-of-bound – positive values!
    
      131068
              excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
    
      196602
              excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      65534
              excess = _mm_div_ps(excess, distance);
    
              // round down – for some reason
    
      65534
              rounddown = _mm_cvttps_epi32(excess);
    
      65534
              excess = _mm_cvtepi32_ps(rounddown);
    
              // plus 1
    
      65534
              adj = _mm_set_ps1(1.0f);
    
      65534
              excess = _mm_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      65534
              adj = _mm_and_ps(adj, is_smaller);
    
      196602
              adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      131068
              excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
    
      65534
              output = _mm_add_ps(input, excess);
    
              _mm_storeu_ps(outPtr, output);
    
      65534
              inPtr += 4;
    
      65534
              outPtr += 4;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
    
      2
      }
    
      2
      static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
    
                                                                const float* inputVector,
    
                                                                const float lower_bound,
    
                                                                const float upper_bound,
    
                                                                unsigned int num_points)
    
      {
    
      2
          const __m128 lower = _mm_set_ps1(lower_bound);
    
      2
          const __m128 upper = _mm_set_ps1(upper_bound);
    
      2
          const __m128 distance = _mm_sub_ps(upper, lower);
    
          __m128 input, output;
    
          __m128 is_smaller, is_bigger;
    
          __m128 excess, adj;
    
          __m128i rounddown;
    
      2
          const float* inPtr = inputVector;
    
      2
          float* outPtr = outputVector;
    
      2
          const size_t quarter_points = num_points / 4;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (size_t counter = 0; counter < quarter_points; counter++) {
    
      65534
              input = _mm_load_ps(inPtr);
    
              // calculate mask: input < lower, input > upper
    
      65534
              is_smaller = _mm_cmplt_ps(input, lower);
    
      65534
              is_bigger = _mm_cmpgt_ps(input, upper);
    
              // find out how far we are out-of-bound – positive values!
    
      131068
              excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
    
      196602
              excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
    
              // how many do we have to add? (int(excess/distance+1)*distance)
    
      65534
              excess = _mm_div_ps(excess, distance);
    
              // round down
    
      65534
              rounddown = _mm_cvttps_epi32(excess);
    
      65534
              excess = _mm_cvtepi32_ps(rounddown);
    
              // plus 1
    
      65534
              adj = _mm_set_ps1(1.0f);
    
      65534
              excess = _mm_add_ps(excess, adj);
    
              // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
    
      65534
              adj = _mm_and_ps(adj, is_smaller);
    
      196602
              adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
    
              // scale by distance, sign
    
      131068
              excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
    
      65534
              output = _mm_add_ps(input, excess);
    
              _mm_store_ps(outPtr, output);
    
      65534
              inPtr += 4;
    
      65534
              outPtr += 4;
    
          }
    
      2
          volk_32f_s32f_s32f_mod_range_32f_generic(
    
              outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2017 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_s32f_s32f_mod_range_32f
12			*
13			* \b wraps floating point numbers to stay within a defined [min,max] range
14			*
15			* <b>Dispatcher Prototype</b>
16			* \code
17			* void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
18			* const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
19			*
20			* \b Inputs
21			* \li inputVector: The input vector
22			* \li lower_bound: The lower output boundary
23			* \li upper_bound: The upper output boundary
24			* \li num_points The number of data points.
25			*
26			* \b Outputs
27			* \li outputVector: The vector where the results will be stored.
28			*
29			* \endcode
30			*/
31
32			#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33			#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
34
35			#ifdef LV_HAVE_GENERIC
36
37		14	static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
38			const float* inputVector,
39			const float lower_bound,
40			const float upper_bound,
41			unsigned int num_points)
42			{
43		14	float* outPtr = outputVector;
44			const float* inPtr;
45		14	const float distance = upper_bound - lower_bound;
46
47	2/2 ✓ Branch 0 taken 262194 times. ✓ Branch 1 taken 14 times.	262208	for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
48		262194	float val = *inPtr;
49	1/2 ✓ Branch 0 taken 262194 times. ✗ Branch 1 not taken.	262194	if (val < lower_bound) {
50		262194	float excess = lower_bound - val;
51		262194	signed int count = (int)(excess / distance);
52		262194	*outPtr = val + (count + 1) * distance;
53		✗	} else if (val > upper_bound) {
54		✗	float excess = val - upper_bound;
55		✗	signed int count = (int)(excess / distance);
56		✗	*outPtr = val - (count + 1) * distance;
57			} else
58		✗	*outPtr = val;
59		262194	outPtr++;
60			}
61		14	}
62			#endif /* LV_HAVE_GENERIC */
63
64
65			#ifdef LV_HAVE_AVX
66			#include <xmmintrin.h>
67
68		2	static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
69			const float* inputVector,
70			const float lower_bound,
71			const float upper_bound,
72			unsigned int num_points)
73			{
74		2	const __m256 lower = _mm256_set1_ps(lower_bound);
75		2	const __m256 upper = _mm256_set1_ps(upper_bound);
76		2	const __m256 distance = _mm256_sub_ps(upper, lower);
77			__m256 input, output;
78			__m256 is_smaller, is_bigger;
79			__m256 excess, adj;
80
81		2	const float* inPtr = inputVector;
82		2	float* outPtr = outputVector;
83		2	const size_t eight_points = num_points / 8;
84	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (size_t counter = 0; counter < eight_points; counter++) {
85		32766	input = _mm256_loadu_ps(inPtr);
86			// calculate mask: input < lower, input > upper
87		32766	is_smaller = _mm256_cmp_ps(
88			input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
89		32766	is_bigger = _mm256_cmp_ps(
90			input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
91			// find out how far we are out-of-bound – positive values!
92		65532	excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
93			excess =
94		98298	_mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
95			// how many do we have to add? (int(excess/distance+1)*distance)
96		32766	excess = _mm256_div_ps(excess, distance);
97			// round down
98		65532	excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
99			// plus 1
100		32766	adj = _mm256_set1_ps(1.0f);
101		32766	excess = _mm256_add_ps(excess, adj);
102			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
103		32766	adj = _mm256_and_ps(adj, is_smaller);
104		98298	adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
105			// scale by distance, sign
106		65532	excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107		32766	output = _mm256_add_ps(input, excess);
108			_mm256_storeu_ps(outPtr, output);
109		32766	inPtr += 8;
110		32766	outPtr += 8;
111			}
112
113		2	volk_32f_s32f_s32f_mod_range_32f_generic(
114			outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
115		2	}
116		2	static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
117			const float* inputVector,
118			const float lower_bound,
119			const float upper_bound,
120			unsigned int num_points)
121			{
122		2	const __m256 lower = _mm256_set1_ps(lower_bound);
123		2	const __m256 upper = _mm256_set1_ps(upper_bound);
124		2	const __m256 distance = _mm256_sub_ps(upper, lower);
125			__m256 input, output;
126			__m256 is_smaller, is_bigger;
127			__m256 excess, adj;
128
129		2	const float* inPtr = inputVector;
130		2	float* outPtr = outputVector;
131		2	const size_t eight_points = num_points / 8;
132	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (size_t counter = 0; counter < eight_points; counter++) {
133		32766	input = _mm256_load_ps(inPtr);
134			// calculate mask: input < lower, input > upper
135		32766	is_smaller = _mm256_cmp_ps(
136			input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
137		32766	is_bigger = _mm256_cmp_ps(
138			input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
139			// find out how far we are out-of-bound – positive values!
140		65532	excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
141			excess =
142		98298	_mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
143			// how many do we have to add? (int(excess/distance+1)*distance)
144		32766	excess = _mm256_div_ps(excess, distance);
145			// round down
146		65532	excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
147			// plus 1
148		32766	adj = _mm256_set1_ps(1.0f);
149		32766	excess = _mm256_add_ps(excess, adj);
150			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
151		32766	adj = _mm256_and_ps(adj, is_smaller);
152		98298	adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
153			// scale by distance, sign
154		65532	excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155		32766	output = _mm256_add_ps(input, excess);
156			_mm256_store_ps(outPtr, output);
157		32766	inPtr += 8;
158		32766	outPtr += 8;
159			}
160
161		2	volk_32f_s32f_s32f_mod_range_32f_generic(
162			outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
163		2	}
164			#endif /* LV_HAVE_AVX */
165
166
167			#ifdef LV_HAVE_SSE2
168			#include <xmmintrin.h>
169
170		2	static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
171			const float* inputVector,
172			const float lower_bound,
173			const float upper_bound,
174			unsigned int num_points)
175			{
176		2	const __m128 lower = _mm_set_ps1(lower_bound);
177		2	const __m128 upper = _mm_set_ps1(upper_bound);
178		2	const __m128 distance = _mm_sub_ps(upper, lower);
179			__m128 input, output;
180			__m128 is_smaller, is_bigger;
181			__m128 excess, adj;
182
183		2	const float* inPtr = inputVector;
184		2	float* outPtr = outputVector;
185		2	const size_t quarter_points = num_points / 4;
186	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (size_t counter = 0; counter < quarter_points; counter++) {
187		65534	input = _mm_loadu_ps(inPtr);
188			// calculate mask: input < lower, input > upper
189		65534	is_smaller = _mm_cmplt_ps(input, lower);
190		65534	is_bigger = _mm_cmpgt_ps(input, upper);
191			// find out how far we are out-of-bound – positive values!
192		131068	excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193		196602	excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
194			// how many do we have to add? (int(excess/distance+1)*distance)
195		65534	excess = _mm_div_ps(excess, distance);
196			// round down
197		131068	excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
198			// plus 1
199		65534	adj = _mm_set_ps1(1.0f);
200		65534	excess = _mm_add_ps(excess, adj);
201			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
202		65534	adj = _mm_and_ps(adj, is_smaller);
203		196602	adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
204			// scale by distance, sign
205		131068	excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206		65534	output = _mm_add_ps(input, excess);
207			_mm_storeu_ps(outPtr, output);
208		65534	inPtr += 4;
209		65534	outPtr += 4;
210			}
211
212		2	volk_32f_s32f_s32f_mod_range_32f_generic(
213			outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
214		2	}
215		2	static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
216			const float* inputVector,
217			const float lower_bound,
218			const float upper_bound,
219			unsigned int num_points)
220			{
			// Wraps every input float into the range spanned by lower_bound and
			// upper_bound by adding (input below range) or subtracting (input above
			// range) a whole multiple of (upper_bound - lower_bound). Four floats are
			// processed per loop iteration using _mm_load_ps / _mm_store_ps; the
			// remaining num_points - 4 * quarter_points points are passed to the
			// generic kernel at the end.
			// NOTE(review): _mm_load_ps/_mm_store_ps are the aligned load/store
			// intrinsics, so inputVector/outputVector are presumably expected to be
			// 16-byte aligned - confirm against the callers.
			// lower/upper/distance: the bounds and their difference in all four lanes
221		2	const __m128 lower = _mm_set_ps1(lower_bound);
222		2	const __m128 upper = _mm_set_ps1(upper_bound);
223		2	const __m128 distance = _mm_sub_ps(upper, lower);
224			__m128 input, output;
225			__m128 is_smaller, is_bigger;
226			__m128 excess, adj;
227
228		2	const float* inPtr = inputVector;
229		2	float* outPtr = outputVector;
230		2	const size_t quarter_points = num_points / 4;
231	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (size_t counter = 0; counter < quarter_points; counter++) {
232		65534	input = _mm_load_ps(inPtr);
233			// per-lane masks: is_smaller where input < lower, is_bigger where input > upper
234		65534	is_smaller = _mm_cmplt_ps(input, lower);
235		65534	is_bigger = _mm_cmpgt_ps(input, upper);
236			// how far each lane is out of bounds – positive values, 0 for in-range lanes
237		131068	excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238		196602	excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
239			// how many do we have to add? (int(excess/distance+1)*distance)
240		65534	excess = _mm_div_ps(excess, distance);
241			// round down via a float -> int32 -> float round trip (original note: SSE
242			// doesn't come with a 4x float -> 4x int32 conversion, hence SSE2 here).
			// NOTE(review): quotients outside the int32 range presumably do not survive
			// this round trip - confirm whether such inputs can occur.
243		131068	excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
244			// plus 1
245		65534	adj = _mm_set_ps1(1.0f);
246		65534	excess = _mm_add_ps(excess, adj);
247			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} here; afterwards it
			// is +1.0f where is_smaller, -1.0f where is_bigger and 0.0f for in-range lanes
248		65534	adj = _mm_and_ps(adj, is_smaller);
249		196602	adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
250			// scale by distance, sign (in-range lanes get a zero correction)
251		131068	excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252		65534	output = _mm_add_ps(input, excess);
253			_mm_store_ps(outPtr, output);
254		65534	inPtr += 4;
255		65534	outPtr += 4;
256			}
257
			// leftover points that do not fill a group of four go through the generic kernel
258		2	volk_32f_s32f_s32f_mod_range_32f_generic(
259			outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
260		2	}
261			#endif /* LV_HAVE_SSE2 */
262
263			#ifdef LV_HAVE_SSE
264			#include <xmmintrin.h>
265
266		2	static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
267			const float* inputVector,
268			const float lower_bound,
269			const float upper_bound,
270			unsigned int num_points)
271			{
			// Wraps every input float into the range spanned by lower_bound and
			// upper_bound by adding (input below range) or subtracting (input above
			// range) a whole multiple of (upper_bound - lower_bound). Four floats are
			// processed per loop iteration using the unaligned _mm_loadu_ps /
			// _mm_storeu_ps; the remaining num_points - 4 * quarter_points points are
			// passed to the generic kernel at the end.
			// NOTE(review): __m128i, _mm_cvttps_epi32 and _mm_cvtepi32_ps are listed as
			// SSE2 in the Intel Intrinsics Guide, while this kernel is guarded by
			// LV_HAVE_SSE and only <xmmintrin.h> is included - confirm this is intended.
			// lower/upper/distance: the bounds and their difference in all four lanes
272		2	const __m128 lower = _mm_set_ps1(lower_bound);
273		2	const __m128 upper = _mm_set_ps1(upper_bound);
274		2	const __m128 distance = _mm_sub_ps(upper, lower);
275			__m128 input, output;
276			__m128 is_smaller, is_bigger;
277			__m128 excess, adj;
278			__m128i rounddown;
279
280		2	const float* inPtr = inputVector;
281		2	float* outPtr = outputVector;
282		2	const size_t quarter_points = num_points / 4;
283	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (size_t counter = 0; counter < quarter_points; counter++) {
284		65534	input = _mm_loadu_ps(inPtr);
285			// per-lane masks: is_smaller where input < lower, is_bigger where input > upper
286		65534	is_smaller = _mm_cmplt_ps(input, lower);
287		65534	is_bigger = _mm_cmpgt_ps(input, upper);
288			// how far each lane is out of bounds – positive values, 0 for in-range lanes
289		131068	excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290		196602	excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
291			// how many do we have to add? (int(excess/distance+1)*distance)
292		65534	excess = _mm_div_ps(excess, distance);
293			// round down – done as a float -> int32 -> float round trip through rounddown
			// NOTE(review): quotients outside the int32 range presumably do not survive
			// this round trip - confirm whether such inputs can occur.
294		65534	rounddown = _mm_cvttps_epi32(excess);
295		65534	excess = _mm_cvtepi32_ps(rounddown);
296			// plus 1
297		65534	adj = _mm_set_ps1(1.0f);
298		65534	excess = _mm_add_ps(excess, adj);
299			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} here; afterwards it
			// is +1.0f where is_smaller, -1.0f where is_bigger and 0.0f for in-range lanes
300		65534	adj = _mm_and_ps(adj, is_smaller);
301		196602	adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
302			// scale by distance, sign (in-range lanes get a zero correction)
303		131068	excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304		65534	output = _mm_add_ps(input, excess);
305			_mm_storeu_ps(outPtr, output);
306		65534	inPtr += 4;
307		65534	outPtr += 4;
308			}
309
			// leftover points that do not fill a group of four go through the generic kernel
310		2	volk_32f_s32f_s32f_mod_range_32f_generic(
311			outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
312		2	}
313		2	static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
314			const float* inputVector,
315			const float lower_bound,
316			const float upper_bound,
317			unsigned int num_points)
318			{
			// Wraps every input float into the range spanned by lower_bound and
			// upper_bound by adding (input below range) or subtracting (input above
			// range) a whole multiple of (upper_bound - lower_bound). Four floats are
			// processed per loop iteration using _mm_load_ps / _mm_store_ps; the
			// remaining num_points - 4 * quarter_points points are passed to the
			// generic kernel at the end.
			// NOTE(review): _mm_load_ps/_mm_store_ps are the aligned load/store
			// intrinsics, so inputVector/outputVector are presumably expected to be
			// 16-byte aligned - confirm against the callers.
			// NOTE(review): __m128i, _mm_cvttps_epi32 and _mm_cvtepi32_ps are listed as
			// SSE2 in the Intel Intrinsics Guide, while this kernel sits in the
			// LV_HAVE_SSE section - confirm this is intended.
			// lower/upper/distance: the bounds and their difference in all four lanes
319		2	const __m128 lower = _mm_set_ps1(lower_bound);
320		2	const __m128 upper = _mm_set_ps1(upper_bound);
321		2	const __m128 distance = _mm_sub_ps(upper, lower);
322			__m128 input, output;
323			__m128 is_smaller, is_bigger;
324			__m128 excess, adj;
325			__m128i rounddown;
326
327		2	const float* inPtr = inputVector;
328		2	float* outPtr = outputVector;
329		2	const size_t quarter_points = num_points / 4;
330	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (size_t counter = 0; counter < quarter_points; counter++) {
331		65534	input = _mm_load_ps(inPtr);
332			// per-lane masks: is_smaller where input < lower, is_bigger where input > upper
333		65534	is_smaller = _mm_cmplt_ps(input, lower);
334		65534	is_bigger = _mm_cmpgt_ps(input, upper);
335			// how far each lane is out of bounds – positive values, 0 for in-range lanes
336		131068	excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337		196602	excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
338			// how many do we have to add? (int(excess/distance+1)*distance)
339		65534	excess = _mm_div_ps(excess, distance);
340			// round down – done as a float -> int32 -> float round trip through rounddown
			// NOTE(review): quotients outside the int32 range presumably do not survive
			// this round trip - confirm whether such inputs can occur.
341		65534	rounddown = _mm_cvttps_epi32(excess);
342		65534	excess = _mm_cvtepi32_ps(rounddown);
343			// plus 1
344		65534	adj = _mm_set_ps1(1.0f);
345		65534	excess = _mm_add_ps(excess, adj);
346			// get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} here; afterwards it
			// is +1.0f where is_smaller, -1.0f where is_bigger and 0.0f for in-range lanes
347		65534	adj = _mm_and_ps(adj, is_smaller);
348		196602	adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
349			// scale by distance, sign (in-range lanes get a zero correction)
350		131068	excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351		65534	output = _mm_add_ps(input, excess);
352			_mm_store_ps(outPtr, output);
353		65534	inPtr += 4;
354		65534	outPtr += 4;
355			}
356
			// leftover points that do not fill a group of four go through the generic kernel
357		2	volk_32f_s32f_s32f_mod_range_32f_generic(
358			outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
359		2	}
360			#endif /* LV_HAVE_SSE */
361
362
363			#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
364