GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
Date:	2023-10-23 23:10:04
	Exec	Total	Coverage
Lines:	314	314	100.0%
Functions:	7	7	100.0%
Branches:	67	76	88.2%
  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_s32fc_x2_rotator_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Rotate input vector at fixed rate per sample from initial phase
    
       * offset.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
    
       * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li inVector: Vector to be rotated.
    
       * \li phase_inc: rotational velocity.
    
       * \li phase: initial phase offset; updated on return so the next call continues from it.
    
       * \li num_points: The number of values in inVector to be rotated and stored into
    
       * outVector.
    
       *
    
       * \b Outputs
    
       * \li outVector: The vector where the results will be stored.
    
       *
    
       * \b Example
    
       * Generate a tone at f=0.3 (normalized frequency) and use the rotator with
    
       * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC
    
       * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation.
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       // Generate a tone at f=0.3
    
       *       float real = std::cos(0.3f * (float)ii);
    
       *       float imag = std::sin(0.3f * (float)ii);
    
       *       in[ii] = lv_cmake(real, imag);
    
       *   }
    
       *   // The oscillator rotates at f=0.1
    
       *   float frequency = 0.1f;
    
       *   lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency));
    
       *   lv_32fc_t phase= lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase)
    
       *
    
       *   // rotate so the output is a tone at f=0.4
    
       *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_increment, &phase, N);
    
       *
    
       *   // print results for inspection
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %+1.2f %+1.2fj\n",
    
       *           ii, lv_creal(out[ii]), lv_cimag(out[ii]));
    
       *   }
    
       *
    
       *   volk_free(in);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
    
      #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
    
      #include <math.h>
    
      #include <stdio.h>
    
      #include <stdlib.h>
    
      #include <volk/volk_complex.h>
    
      #define ROTATOR_RELOAD 512
    
      #define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
    
      #define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
    
      #ifdef LV_HAVE_GENERIC
    
      /*
       * Portable C implementation of the rotator: every input sample is multiplied
       * by the running phasor *phase, and the phasor is then advanced by phase_inc.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  complex factor multiplied into *phase after each sample.
       * phase:      in/out; phasor applied to the first sample on entry, phasor
       *             following the last processed sample on return.
       * num_points: number of samples to process.
       */
      6
      static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
    
                                                                 const lv_32fc_t* inVector,
    
                                                                 const lv_32fc_t phase_inc,
    
                                                                 lv_32fc_t* phase,
    
                                                                 unsigned int num_points)
    
      {
    
      6
          unsigned int i = 0;
    
      6
          int j = 0;
    
          // Whole blocks of ROTATOR_RELOAD samples.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 6 times.

      516
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
    
        2/2✓ Branch 0 taken 261120 times.
✓ Branch 1 taken 510 times.

      261630
              for (j = 0; j < ROTATOR_RELOAD; ++j) {
    
      261120
                  *outVector++ = *inVector++ * (*phase);
    
      261120
                  (*phase) *= phase_inc;
    
              }
    
              // After each block, divide the phasor by its own magnitude so that
    
              // floating point rounding error does not make the magnitude drift.
    
      510
              (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    
          }
    
          // Remaining num_points % ROTATOR_RELOAD samples.
    
        2/2✓ Branch 0 taken 1034 times.
✓ Branch 1 taken 6 times.

      1040
          for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
    
      1034
              *outVector++ = *inVector++ * (*phase);
    
      1034
              (*phase) *= phase_inc;
    
          }
    
        1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.

      6
          if (i) {
    
              // i is nonzero only when the remainder loop above ran, i.e. the
    
              // phase has changed since the last per-block normalization.
    
      6
              (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    
          }
    
      6
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      #include <volk/volk_neon_intrinsics.h>
    
      /*
       * NEON implementation of the rotator, handling four complex samples per
       * vector iteration.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       *
       * NOTE(review): _vmultiply_complexq_f32, _vmagnitudesquaredq_f32 and
       * _vinvsqrtq_f32 come from volk_neon_intrinsics.h, which is not visible
       * here; the comments below describe them by how they are used.
       */
      static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
    
                                                              const lv_32fc_t* inVector,
    
                                                              const lv_32fc_t phase_inc,
    
                                                              lv_32fc_t* phase,
    
                                                              unsigned int num_points)
    
      {
    
          lv_32fc_t* outputVectorPtr = outVector;
    
          const lv_32fc_t* inputVectorPtr = inVector;
    
          lv_32fc_t incr = 1;
    
          lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    
          float32x4x2_t input_vec;
    
          float32x4x2_t output_vec;
    
          unsigned int i = 0, j = 0;
    
          // const unsigned int quarter_points = num_points / 4;
    
          // Build the four starting phasors: phasePtr[k] = (*phase) * phase_inc^k.
    
          for (i = 0; i < 4; ++i) {
    
              phasePtr[i] *= incr;
    
              incr *= (phase_inc);
    
          }
    
          // After the loop above incr equals phase_inc^4, i.e. the step that
    
          // advances all four phasors by one vector iteration.
    
          const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    
          const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    
          float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
    
          // Whole blocks of ROTATOR_RELOAD samples, four samples per iteration.
    
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    
              for (j = 0; j < ROTATOR_RELOAD_4; j++) {
    
                  input_vec = vld2q_f32((float*)inputVectorPtr);
    
                  // Prefetch next one, speeds things up
    
                  __VOLK_PREFETCH(inputVectorPtr + 4);
    
                  // Rotate
    
                  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
    
                  // Increase phase
    
                  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
    
                  // Store output
    
                  vst2q_f32((float*)outputVectorPtr, output_vec);
    
                  outputVectorPtr += 4;
    
                  inputVectorPtr += 4;
    
              }
    
              // normalize phase so magnitude doesn't grow because of
    
              // floating point rounding error
    
              const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
    
              const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
    
              // Multiply complex with real
    
              phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    
              phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    
          }
    
          // Remaining groups of four within the last partial block.
    
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
    
              input_vec = vld2q_f32((float*)inputVectorPtr);
    
              // Prefetch next one, speeds things up
    
              __VOLK_PREFETCH(inputVectorPtr + 4);
    
              // Rotate
    
              output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
    
              // Increase phase
    
              phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
    
              // Store output
    
              vst2q_f32((float*)outputVectorPtr, output_vec);
    
              outputVectorPtr += 4;
    
              inputVectorPtr += 4;
    
          }
    
          // i != 0 means the loop above ran at least once
    
          if (i) {
    
              // normalize phase so magnitude doesn't grow because of
    
              // floating point rounding error
    
              const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
    
              const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
    
              // Multiply complex with real
    
              phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    
              phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    
          }
    
          // Store current phase
    
          vst2q_f32((float*)phasePtr, phase_vec);
    
          // Handle the last num_points % 4 samples one at a time, continuing
    
          // from the first stored phasor and stepping it by phase_inc.
    
          for (i = 0; i < num_points % 4; i++) {
    
              *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
    
              phasePtr[0] *= (phase_inc);
    
          }
    
          // Write the phase back so the next call continues with continuous phase
    
          (*phase) = phasePtr[0];
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      /*
       * SSE4.1 implementation of the rotator, handling two complex samples per
       * vector iteration. This variant accesses inVector and outVector with the
       * aligned intrinsics _mm_load_ps / _mm_store_ps.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       */
      2
      static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
    
                                                                  const lv_32fc_t* inVector,
    
                                                                  const lv_32fc_t phase_inc,
    
                                                                  lv_32fc_t* phase,
    
                                                                  unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = outVector;
    
      2
          const lv_32fc_t* aPtr = inVector;
    
      2
          lv_32fc_t incr = 1;
    
      2
          lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
    
      2
          unsigned int i, j = 0;
    
          // phase_Ptr[k] = (*phase) * phase_inc^k; afterwards incr == phase_inc^2,
    
          // the step that advances both phasors by one vector iteration.
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.

      6
          for (i = 0; i < 2; ++i) {
    
      4
              phase_Ptr[i] *= incr;
    
      4
              incr *= (phase_inc);
    
          }
    
          __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
    
      2
          phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    
          // _mm_set_ps takes its arguments from the highest element down, so this
    
          // places incr twice in (real, imag) order.
    
      2
          inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    
          // Whole blocks of ROTATOR_RELOAD samples, two samples per iteration.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    
        2/2✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 510 times.

      131070
              for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
    
                  // Two interleaved complex products computed side by side:
    
                  // z = aVal * phase_Val (output) and phase_Val * inc_Val (next
    
                  // phase). Each multiplies by the duplicated real parts, then
    
                  // by the duplicated imaginary parts after swapping re/im
    
                  // (shuffle 0xB1), and combines both with addsub.
    
      130560
                  aVal = _mm_load_ps((float*)aPtr);
    
      130560
                  yl = _mm_moveldup_ps(phase_Val);
    
      130560
                  yh = _mm_movehdup_ps(phase_Val);
    
      130560
                  ylp = _mm_moveldup_ps(inc_Val);
    
      130560
                  yhp = _mm_movehdup_ps(inc_Val);
    
      130560
                  tmp1 = _mm_mul_ps(aVal, yl);
    
      130560
                  tmp1p = _mm_mul_ps(phase_Val, ylp);
    
      130560
                  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    
      130560
                  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      130560
                  tmp2 = _mm_mul_ps(aVal, yh);
    
      130560
                  tmp2p = _mm_mul_ps(phase_Val, yhp);
    
      130560
                  z = _mm_addsub_ps(tmp1, tmp2);
    
      130560
                  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
    
                  _mm_store_ps((float*)cPtr, z);
    
      130560
                  aPtr += 2;
    
      130560
                  cPtr += 2;
    
              }
    
              // Normalize both phasors: square the components, add re^2 + im^2
    
              // per phasor (hadd), spread each sum over its re/im slots (shuffle
    
              // 0xD8), take the square root and divide the phasors by it.
    
      510
              tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    
      510
              tmp2 = _mm_hadd_ps(tmp1, tmp1);
    
      510
              tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    
      510
              tmp2 = _mm_sqrt_ps(tmp1);
    
      510
              phase_Val = _mm_div_ps(phase_Val, tmp2);
    
          }
    
          // Remaining pairs within the last partial block (same arithmetic).
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
    
      510
              aVal = _mm_load_ps((float*)aPtr);
    
      510
              yl = _mm_moveldup_ps(phase_Val);
    
      510
              yh = _mm_movehdup_ps(phase_Val);
    
      510
              ylp = _mm_moveldup_ps(inc_Val);
    
      510
              yhp = _mm_movehdup_ps(inc_Val);
    
      510
              tmp1 = _mm_mul_ps(aVal, yl);
    
      510
              tmp1p = _mm_mul_ps(phase_Val, ylp);
    
      510
              aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    
      510
              phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      510
              tmp2 = _mm_mul_ps(aVal, yh);
    
      510
              tmp2p = _mm_mul_ps(phase_Val, yhp);
    
      510
              z = _mm_addsub_ps(tmp1, tmp2);
    
      510
              phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
    
              _mm_store_ps((float*)cPtr, z);
    
      510
              aPtr += 2;
    
      510
              cPtr += 2;
    
          }
    
          // i != 0 means the remainder loop ran, so normalize once more.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (i) {
    
      2
              tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    
      2
              tmp2 = _mm_hadd_ps(tmp1, tmp1);
    
      2
              tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    
      2
              tmp2 = _mm_sqrt_ps(tmp1);
    
      2
              phase_Val = _mm_div_ps(phase_Val, tmp2);
    
          }
    
          _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    
          // An odd num_points leaves one sample; rotate it with the first stored
    
          // phasor and advance that phasor by a single phase_inc.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points & 1) {
    
      2
              *cPtr++ = *aPtr++ * phase_Ptr[0];
    
      2
              phase_Ptr[0] *= (phase_inc);
    
          }
    
          // Write the phase back for the next call.
    
      2
          (*phase) = phase_Ptr[0];
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 for aligned */
    
      #ifdef LV_HAVE_SSE4_1
    
      #include <smmintrin.h>
    
      /*
       * SSE4.1 implementation of the rotator, handling two complex samples per
       * vector iteration. This variant accesses inVector and outVector with the
       * unaligned intrinsics _mm_loadu_ps / _mm_storeu_ps.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       */
      2
      static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
    
                                                                  const lv_32fc_t* inVector,
    
                                                                  const lv_32fc_t phase_inc,
    
                                                                  lv_32fc_t* phase,
    
                                                                  unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = outVector;
    
      2
          const lv_32fc_t* aPtr = inVector;
    
      2
          lv_32fc_t incr = 1;
    
      2
          lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
    
      2
          unsigned int i, j = 0;
    
          // phase_Ptr[k] = (*phase) * phase_inc^k; afterwards incr == phase_inc^2,
    
          // the step that advances both phasors by one vector iteration.
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.

      6
          for (i = 0; i < 2; ++i) {
    
      4
              phase_Ptr[i] *= incr;
    
      4
              incr *= (phase_inc);
    
          }
    
          /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    
          printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    
          printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    
          __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
    
      2
          phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    
          // _mm_set_ps takes its arguments from the highest element down, so this
    
          // places incr twice in (real, imag) order.
    
      2
          inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    
          // Whole blocks of ROTATOR_RELOAD samples, two samples per iteration.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    
        2/2✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 510 times.

      131070
              for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
    
                  // Two interleaved complex products computed side by side:
    
                  // z = aVal * phase_Val (output) and phase_Val * inc_Val (next
    
                  // phase). Each multiplies by the duplicated real parts, then
    
                  // by the duplicated imaginary parts after swapping re/im
    
                  // (shuffle 0xB1), and combines both with addsub.
    
      130560
                  aVal = _mm_loadu_ps((float*)aPtr);
    
      130560
                  yl = _mm_moveldup_ps(phase_Val);
    
      130560
                  yh = _mm_movehdup_ps(phase_Val);
    
      130560
                  ylp = _mm_moveldup_ps(inc_Val);
    
      130560
                  yhp = _mm_movehdup_ps(inc_Val);
    
      130560
                  tmp1 = _mm_mul_ps(aVal, yl);
    
      130560
                  tmp1p = _mm_mul_ps(phase_Val, ylp);
    
      130560
                  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    
      130560
                  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      130560
                  tmp2 = _mm_mul_ps(aVal, yh);
    
      130560
                  tmp2p = _mm_mul_ps(phase_Val, yhp);
    
      130560
                  z = _mm_addsub_ps(tmp1, tmp2);
    
      130560
                  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
    
                  _mm_storeu_ps((float*)cPtr, z);
    
      130560
                  aPtr += 2;
    
      130560
                  cPtr += 2;
    
              }
    
              // Normalize both phasors: square the components, add re^2 + im^2
    
              // per phasor (hadd), spread each sum over its re/im slots (shuffle
    
              // 0xD8), take the square root and divide the phasors by it.
    
      510
              tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    
      510
              tmp2 = _mm_hadd_ps(tmp1, tmp1);
    
      510
              tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    
      510
              tmp2 = _mm_sqrt_ps(tmp1);
    
      510
              phase_Val = _mm_div_ps(phase_Val, tmp2);
    
          }
    
          // Remaining pairs within the last partial block (same arithmetic).
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
    
      510
              aVal = _mm_loadu_ps((float*)aPtr);
    
      510
              yl = _mm_moveldup_ps(phase_Val);
    
      510
              yh = _mm_movehdup_ps(phase_Val);
    
      510
              ylp = _mm_moveldup_ps(inc_Val);
    
      510
              yhp = _mm_movehdup_ps(inc_Val);
    
      510
              tmp1 = _mm_mul_ps(aVal, yl);
    
      510
              tmp1p = _mm_mul_ps(phase_Val, ylp);
    
      510
              aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    
      510
              phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      510
              tmp2 = _mm_mul_ps(aVal, yh);
    
      510
              tmp2p = _mm_mul_ps(phase_Val, yhp);
    
      510
              z = _mm_addsub_ps(tmp1, tmp2);
    
      510
              phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
    
              _mm_storeu_ps((float*)cPtr, z);
    
      510
              aPtr += 2;
    
      510
              cPtr += 2;
    
          }
    
          // i != 0 means the remainder loop ran, so normalize once more.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (i) {
    
      2
              tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    
      2
              tmp2 = _mm_hadd_ps(tmp1, tmp1);
    
      2
              tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    
      2
              tmp2 = _mm_sqrt_ps(tmp1);
    
      2
              phase_Val = _mm_div_ps(phase_Val, tmp2);
    
          }
    
          _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    
          // An odd num_points leaves one sample; rotate it with the first stored
    
          // phasor and advance that phasor by a single phase_inc.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (num_points & 1) {
    
      2
              *cPtr++ = *aPtr++ * phase_Ptr[0];
    
      2
              phase_Ptr[0] *= (phase_inc);
    
          }
    
          // Write the phase back for the next call.
    
      2
          (*phase) = phase_Ptr[0];
    
      2
      }
    
      #endif /* LV_HAVE_SSE4_1 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      /*
       * AVX implementation of the rotator, handling four complex samples per
       * vector iteration. This variant accesses inVector and outVector with the
       * aligned intrinsics _mm256_load_ps / _mm256_store_ps.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       *
       * NOTE(review): _mm256_complexmul_ps and _mm256_normalize_ps come from
       * volk_avx_intrinsics.h, which is not visible here. They are presumably
       * an element-wise complex product and a scaling of each complex element
       * to unit magnitude (they sit where the SSE4.1 kernels spell those steps
       * out by hand) - confirm against that header.
       */
      2
      static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
    
                                                               const lv_32fc_t* inVector,
    
                                                               const lv_32fc_t phase_inc,
    
                                                               lv_32fc_t* phase,
    
                                                               unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = outVector;
    
      2
          const lv_32fc_t* aPtr = inVector;
    
      2
          lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    
      2
          lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
    
      2
          unsigned int i, j = 0;
    
          // phase_Ptr[k] = (*phase) * phase_inc^k; afterwards incr == phase_inc^4,
    
          // the step that advances all four phasors by one vector iteration.
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.

      10
          for (i = 0; i < 4; ++i) {
    
      8
              phase_Ptr[i] *= incr;
    
      8
              incr *= (phase_inc);
    
          }
    
          __m256 aVal, phase_Val, z;
    
      2
          phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    
          // _mm256_set_ps takes its arguments from the highest element down, so
    
          // this places incr four times in (real, imag) order.
    
      2
          const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr));
    
          // Whole blocks of ROTATOR_RELOAD samples, four samples per iteration,
    
          // with a phase normalization after every block.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    
        2/2✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.

      65790
              for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
    
      65280
                  aVal = _mm256_load_ps((float*)aPtr);
    
      65280
                  z = _mm256_complexmul_ps(aVal, phase_Val);
    
      65280
                  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
    
                  _mm256_store_ps((float*)cPtr, z);
    
      65280
                  aPtr += 4;
    
      65280
                  cPtr += 4;
    
              }
    
      510
              phase_Val = _mm256_normalize_ps(phase_Val);
    
          }
    
          // Remaining groups of four within the last partial block.
    
        2/2✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.

      256
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
    
      254
              aVal = _mm256_load_ps((float*)aPtr);
    
      254
              z = _mm256_complexmul_ps(aVal, phase_Val);
    
      254
              phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
    
              _mm256_store_ps((float*)cPtr, z);
    
      254
              aPtr += 4;
    
      254
              cPtr += 4;
    
          }
    
          // i != 0 means the remainder loop ran, so normalize once more.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (i) {
    
      2
              phase_Val = _mm256_normalize_ps(phase_Val);
    
          }
    
          _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    
          // Publish the first stored phasor, then let the generic kernel process
    
          // the last num_points % 4 samples and update *phase from there.
    
      2
          (*phase) = phase_Ptr[0];
    
      2
          volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
    
      2
      }
    
      #endif /* LV_HAVE_AVX for aligned */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      /*
       * AVX implementation of the rotator, handling four complex samples per
       * vector iteration. This variant accesses inVector and outVector with the
       * unaligned intrinsics _mm256_loadu_ps / _mm256_storeu_ps.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       *
       * NOTE(review): _mm256_complexmul_ps and _mm256_normalize_ps come from
       * volk_avx_intrinsics.h, which is not visible here. They are presumably
       * an element-wise complex product and a scaling of each complex element
       * to unit magnitude (they sit where the SSE4.1 kernels spell those steps
       * out by hand) - confirm against that header.
       */
      2
      static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
    
                                                               const lv_32fc_t* inVector,
    
                                                               const lv_32fc_t phase_inc,
    
                                                               lv_32fc_t* phase,
    
                                                               unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = outVector;
    
      2
          const lv_32fc_t* aPtr = inVector;
    
      2
          lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    
      2
          lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
    
      2
          unsigned int i, j = 0;
    
          // phase_Ptr[k] = (*phase) * phase_inc^k; afterwards incr == phase_inc^4,
    
          // the step that advances all four phasors by one vector iteration.
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.

      10
          for (i = 0; i < 4; ++i) {
    
      8
              phase_Ptr[i] *= incr;
    
      8
              incr *= (phase_inc);
    
          }
    
          __m256 aVal, phase_Val, z;
    
      2
          phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    
          // _mm256_set_ps takes its arguments from the highest element down, so
    
          // this places incr four times in (real, imag) order.
    
      2
          const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr),
    
                                               lv_cimag(incr),
    
                                               lv_creal(incr));
    
          // Whole blocks of ROTATOR_RELOAD samples, four samples per iteration,
    
          // with a phase normalization after every block.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
    
        2/2✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.

      65790
              for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
    
      65280
                  aVal = _mm256_loadu_ps((float*)aPtr);
    
      65280
                  z = _mm256_complexmul_ps(aVal, phase_Val);
    
      65280
                  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
    
                  _mm256_storeu_ps((float*)cPtr, z);
    
      65280
                  aPtr += 4;
    
      65280
                  cPtr += 4;
    
              }
    
      510
              phase_Val = _mm256_normalize_ps(phase_Val);
    
          }
    
          // Remaining groups of four within the last partial block.
    
        2/2✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.

      256
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
    
      254
              aVal = _mm256_loadu_ps((float*)aPtr);
    
      254
              z = _mm256_complexmul_ps(aVal, phase_Val);
    
      254
              phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
    
              _mm256_storeu_ps((float*)cPtr, z);
    
      254
              aPtr += 4;
    
      254
              cPtr += 4;
    
          }
    
          // i != 0 means the remainder loop ran, so normalize once more.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (i) {
    
      2
              phase_Val = _mm256_normalize_ps(phase_Val);
    
          }
    
          _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    
          // Publish the first stored phasor, then let the generic kernel process
    
          // the last num_points % 4 samples and update *phase from there.
    
      2
          (*phase) = phase_Ptr[0];
    
      2
          volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*
       * AVX + FMA implementation of the rotator, handling four complex samples
       * per vector iteration. This variant accesses inVector and outVector with
       * the aligned intrinsics _mm256_load_ps / _mm256_store_ps.
       *
       * outVector:  destination for the num_points rotated samples.
       * inVector:   the num_points samples to rotate.
       * phase_inc:  per-sample complex phase increment.
       * phase:      in/out; phasor for the first sample on entry, phasor to
       *             continue from on return.
       * num_points: number of samples to process.
       */
      2
      static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
    
                                                                   const lv_32fc_t* inVector,
    
                                                                   const lv_32fc_t phase_inc,
    
                                                                   lv_32fc_t* phase,
    
                                                                   unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = outVector;
    
      2
          const lv_32fc_t* aPtr = inVector;
    
      2
          lv_32fc_t incr = 1;
    
          // phase_Ptr is declared with 32-byte alignment because it is read and
    
          // written below with the aligned _mm256_load_ps / _mm256_store_ps.
    
          __VOLK_ATTR_ALIGNED(32)
    
      2
          lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
    
      2
          unsigned int i, j = 0;
    
          // phase_Ptr[k] = (*phase) * phase_inc^k; afterwards incr == phase_inc^4,
    
          // the step that advances all four phasors by one vector iteration.
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.

      10
          for (i = 0; i < 4; ++i) {
    
      8
              phase_Ptr[i] *= incr;
    
      8
              incr *= (phase_inc);
    
          }
    
          __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
    
      2
          phase_Val = _mm256_load_ps((float*)phase_Ptr);
    
          // _mm256_set_ps takes its arguments from the highest element down, so
    
          // this places incr four times in (real, imag) order.
    
      2
          inc_Val = _mm256_set_ps(lv_cimag(incr),
    
                                  lv_creal(incr),
    
                                  lv_cimag(incr),
    
                                  lv_creal(incr),
    
                                  lv_cimag(incr),
    
                                  lv_creal(incr),
    
                                  lv_cimag(incr),
    
                                  lv_creal(incr));
    
          // Whole blocks of ROTATOR_RELOAD samples, four samples per iteration.
    
        2/2✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.

      512
          for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    
        2/2✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.

      65790
              for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
    
                  // Two interleaved complex products computed side by side:
    
                  // z = aVal * phase_Val (output) and phase_Val * inc_Val (next
    
                  // phase). The re/im-swapped operand is multiplied by the
    
                  // duplicated imaginary parts, and fmaddsub fuses the multiply
    
                  // by the duplicated real parts with the final add/subtract.
    
      65280
                  aVal = _mm256_load_ps((float*)aPtr);
    
      65280
                  yl = _mm256_moveldup_ps(phase_Val);
    
      65280
                  yh = _mm256_movehdup_ps(phase_Val);
    
      65280
                  ylp = _mm256_moveldup_ps(inc_Val);
    
      65280
                  yhp = _mm256_movehdup_ps(inc_Val);
    
      65280
                  tmp1 = aVal;
    
      65280
                  tmp1p = phase_Val;
    
      65280
                  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    
      65280
                  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      65280
                  tmp2 = _mm256_mul_ps(aVal, yh);
    
      65280
                  tmp2p = _mm256_mul_ps(phase_Val, yhp);
    
      65280
                  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    
      65280
                  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
    
                  _mm256_store_ps((float*)cPtr, z);
    
      65280
                  aPtr += 4;
    
      65280
                  cPtr += 4;
    
              }
    
              // Normalize the phasors: square the components, add re^2 + im^2
    
              // per phasor (hadd), spread each sum over its re/im slots (shuffle
    
              // 0xD8), take the square root and divide the phasors by it.
    
      510
              tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    
      510
              tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    
      510
              tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    
      510
              tmp2 = _mm256_sqrt_ps(tmp1);
    
      510
              phase_Val = _mm256_div_ps(phase_Val, tmp2);
    
          }
    
          // Remaining groups of four within the last partial block (same
    
          // arithmetic as above).
    
        2/2✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.

      256
          for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
    
      254
              aVal = _mm256_load_ps((float*)aPtr);
    
      254
              yl = _mm256_moveldup_ps(phase_Val);
    
      254
              yh = _mm256_movehdup_ps(phase_Val);
    
      254
              ylp = _mm256_moveldup_ps(inc_Val);
    
      254
              yhp = _mm256_movehdup_ps(inc_Val);
    
      254
              tmp1 = aVal;
    
      254
              tmp1p = phase_Val;
    
      254
              aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    
      254
              phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    
      254
              tmp2 = _mm256_mul_ps(aVal, yh);
    
      254
              tmp2p = _mm256_mul_ps(phase_Val, yhp);
    
      254
              z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    
      254
              phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
    
              _mm256_store_ps((float*)cPtr, z);
    
      254
              aPtr += 4;
    
      254
              cPtr += 4;
    
          }
    
          // i != 0 means the remainder loop ran, so normalize once more.
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if (i) {
    
      2
              tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    
      2
              tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    
      2
              tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    
      2
              tmp2 = _mm256_sqrt_ps(tmp1);
    
      2
              phase_Val = _mm256_div_ps(phase_Val, tmp2);
    
          }
    
          _mm256_store_ps((float*)phase_Ptr, phase_Val);
    
          // Handle the last num_points % 4 samples one at a time, continuing
    
          // from the first stored phasor and stepping it by phase_inc.
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (i = 0; i < num_points % 4; ++i) {
    
      6
              *cPtr++ = *aPtr++ * phase_Ptr[0];
    
      6
              phase_Ptr[0] *= (phase_inc);
    
          }
    
          // Write the phase back for the next call.
    
      2
          (*phase) = phase_Ptr[0];
    
      2
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
    
      #if LV_HAVE_AVX && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      2
      /*
       * Rotator kernel using AVX + FMA with unaligned loads and stores.
       *
       * Every input sample is multiplied by a running complex phasor, and the
       * phasor is multiplied by phase_inc after each sample.
       *
       * outVector:  receives num_points rotated samples.
       * inVector:   num_points samples to rotate.
       * phase_inc:  per-sample complex multiplier applied to the phasor.
       * phase:      in: phasor applied to the first sample;
       *             out: phasor that follows the last processed sample.
       * num_points: number of complex samples to process.
       *
       * Four samples are handled per vector iteration. The phasor vector is
       * rescaled to unit magnitude after every block of ROTATOR_RELOAD_4 vector
       * iterations and once more after the partial block, if that block ran at
       * least one iteration. The last (num_points % 4) samples are handled by a
       * scalar loop.
       */
      static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                                   const lv_32fc_t* inVector,
                                                                   const lv_32fc_t phase_inc,
                                                                   lv_32fc_t* phase,
                                                                   unsigned int num_points)
      {
          lv_32fc_t* dst = outVector;
          const lv_32fc_t* src = inVector;
          lv_32fc_t stride = 1;
          lv_32fc_t lane_phase[4] = { (*phase), (*phase), (*phase), (*phase) };
          unsigned int lane;
          unsigned int block;
          unsigned int vec;
          unsigned int rest;

          /* Lane k starts at phase * phase_inc^k; stride ends up as phase_inc^4. */
          for (lane = 0; lane < 4; ++lane) {
              lane_phase[lane] *= stride;
              stride *= (phase_inc);
          }

          const unsigned int full_blocks = (unsigned int)(num_points / ROTATOR_RELOAD);
          const unsigned int tail_vectors = (num_points % ROTATOR_RELOAD) / 4;
          const unsigned int tail_scalars = num_points % 4;

          __m256 phasor = _mm256_loadu_ps((float*)lane_phase);

          /* The four-sample increment, replicated into all four complex lanes. */
          const __m256 stride_vec = _mm256_set_ps(lv_cimag(stride),
                                                  lv_creal(stride),
                                                  lv_cimag(stride),
                                                  lv_creal(stride),
                                                  lv_cimag(stride),
                                                  lv_creal(stride),
                                                  lv_cimag(stride),
                                                  lv_creal(stride));

          /* Real and imaginary parts of the increment, each duplicated per pair. */
          const __m256 stride_re = _mm256_moveldup_ps(stride_vec);
          const __m256 stride_im = _mm256_movehdup_ps(stride_vec);

          for (block = 0; block < full_blocks; block++) {
              for (vec = 0; vec < ROTATOR_RELOAD_4; ++vec) {
                  const __m256 samples = _mm256_loadu_ps((float*)src);
                  const __m256 phasor_re = _mm256_moveldup_ps(phasor);
                  const __m256 phasor_im = _mm256_movehdup_ps(phasor);

                  /* 0xB1 swaps the real and imaginary float of every complex value. */
                  const __m256 samples_swapped = _mm256_shuffle_ps(samples, samples, 0xB1);
                  const __m256 phasor_swapped = _mm256_shuffle_ps(phasor, phasor, 0xB1);

                  const __m256 sample_cross = _mm256_mul_ps(samples_swapped, phasor_im);
                  const __m256 phasor_cross = _mm256_mul_ps(phasor_swapped, stride_im);

                  /* Complex products: samples * phasor and phasor * stride. */
                  const __m256 rotated = _mm256_fmaddsub_ps(samples, phasor_re, sample_cross);
                  phasor = _mm256_fmaddsub_ps(phasor, stride_re, phasor_cross);

                  _mm256_storeu_ps((float*)dst, rotated);

                  src += 4;
                  dst += 4;
              }

              /* Divide each phasor lane by its own magnitude. */
              {
                  const __m256 squares = _mm256_mul_ps(phasor, phasor);
                  const __m256 pair_sums = _mm256_hadd_ps(squares, squares);
                  const __m256 norm_sq = _mm256_shuffle_ps(pair_sums, pair_sums, 0xD8);
                  const __m256 norm = _mm256_sqrt_ps(norm_sq);
                  phasor = _mm256_div_ps(phasor, norm);
              }
          }

          /* Vector iterations left over after the last full block. */
          for (vec = 0; vec < tail_vectors; ++vec) {
              const __m256 samples = _mm256_loadu_ps((float*)src);
              const __m256 phasor_re = _mm256_moveldup_ps(phasor);
              const __m256 phasor_im = _mm256_movehdup_ps(phasor);

              const __m256 samples_swapped = _mm256_shuffle_ps(samples, samples, 0xB1);
              const __m256 phasor_swapped = _mm256_shuffle_ps(phasor, phasor, 0xB1);

              const __m256 sample_cross = _mm256_mul_ps(samples_swapped, phasor_im);
              const __m256 phasor_cross = _mm256_mul_ps(phasor_swapped, stride_im);

              const __m256 rotated = _mm256_fmaddsub_ps(samples, phasor_re, sample_cross);
              phasor = _mm256_fmaddsub_ps(phasor, stride_re, phasor_cross);

              _mm256_storeu_ps((float*)dst, rotated);

              src += 4;
              dst += 4;
          }

          /* Rescale again only when the partial block did any work. */
          if (tail_vectors != 0) {
              const __m256 squares = _mm256_mul_ps(phasor, phasor);
              const __m256 pair_sums = _mm256_hadd_ps(squares, squares);
              const __m256 norm_sq = _mm256_shuffle_ps(pair_sums, pair_sums, 0xD8);
              const __m256 norm = _mm256_sqrt_ps(norm_sq);
              phasor = _mm256_div_ps(phasor, norm);
          }

          /* Lane 0 holds the phasor for the next unprocessed sample. */
          _mm256_storeu_ps((float*)lane_phase, phasor);

          for (rest = 0; rest < tail_scalars; ++rest) {
              *dst++ = *src++ * lane_phase[0];
              lane_phase[0] *= (phase_inc);
          }

          (*phase) = lane_phase[0];
      }
    
      #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
    
      #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */