GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_x2_multiply_32fc.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	127	127	100.0%
Functions:	9	9	100.0%
Branches:	26	28	92.9%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_x2_multiply_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Multiplies two complex vectors and returns the complex result.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
    
       * lv_32fc_t* bVector, unsigned int num_points); \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: The first input vector of complex floats.
    
       * \li bVector: The second input vector of complex floats.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector of complex floats.
    
       *
    
       * \b Example
    
       * Mix two signals at f=0.3 and 0.1.
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   lv_32fc_t* sig_1  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   lv_32fc_t* sig_2  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       // Generate two tones
    
       *       float real_1 = std::cos(0.3f * (float)ii);
    
       *       float imag_1 = std::sin(0.3f * (float)ii);
    
       *       sig_1[ii] = lv_cmake(real_1, imag_1);
    
       *       float real_2 = std::cos(0.1f * (float)ii);
    
       *       float imag_2 = std::sin(0.1f * (float)ii);
    
       *       sig_2[ii] = lv_cmake(real_2, imag_2);
    
       *   }
    
       *
    
       *   volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
    
       *
    
       *   volk_free(sig_1);
    
       *   volk_free(sig_2);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
    
      #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*!
    
        \brief Multiplies the two input complex vectors and stores their results in the third
    
        vector \param cVector The vector where the results will be stored \param aVector One of
    
        the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
    
        num_points The number of complex values in aVector and bVector to be multiplied together
    
        and stored into cVector
    
      */
    
      2
      static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
    
                                                               const lv_32fc_t* aVector,
    
                                                               const lv_32fc_t* bVector,
    
                                                               unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
              const __m256 x =
    
      65534
                  _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
              const __m256 y =
    
      65534
                  _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      65534
              const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    
      65534
              const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
    
      65534
              const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
    
      65534
              const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
    
      65534
              const __m256 z = _mm256_fmaddsub_ps(
    
                  x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    
              _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * (*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
    
                                                          const lv_32fc_t* aVector,
    
                                                          const lv_32fc_t* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m256 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              x = _mm256_loadu_ps(
    
                  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      65534
              y = _mm256_loadu_ps(
    
                  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    
      65534
              z = _mm256_complexmul_ps(x, y);
    
              _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * (*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
    
                                                           const lv_32fc_t* aVector,
    
                                                           const lv_32fc_t* bVector,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int halfPoints = num_points / 2;
    
          __m128 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (; number < halfPoints; number++) {
    
      131070
              x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
      131070
              y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      131070
              z = _mm_complexmul_ps(x, y);
    
              _mm_storeu_ps((float*)c, z); // Store the results back into the C container
    
      131070
              a += 2;
    
      131070
              b += 2;
    
      131070
              c += 2;
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if ((num_points % 2) != 0) {
    
      2
              *c = (*a) * (*b);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE3 */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
    
                                                            const lv_32fc_t* aVector,
    
                                                            const lv_32fc_t* bVector,
    
                                                            unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const lv_32fc_t* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) * (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
    
      #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
    
      #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #if LV_HAVE_AVX2 && LV_HAVE_FMA
    
      #include <immintrin.h>
    
      /*!
    
        \brief Multiplies the two input complex vectors and stores their results in the third
    
        vector \param cVector The vector where the results will be stored \param aVector One of
    
        the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
    
        num_points The number of complex values in aVector and bVector to be multiplied together
    
        and stored into cVector
    
      */
    
      2
      static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
    
                                                               const lv_32fc_t* aVector,
    
                                                               const lv_32fc_t* bVector,
    
                                                               unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
              const __m256 x =
    
      65534
                  _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
              const __m256 y =
    
      65534
                  _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      65534
              const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    
      65534
              const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
    
      65534
              const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
    
      65534
              const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
    
      65534
              const __m256 z = _mm256_fmaddsub_ps(
    
                  x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    
              _mm256_store_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * (*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
    
                                                          const lv_32fc_t* aVector,
    
                                                          const lv_32fc_t* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m256 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      65534
              y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    
      65534
              z = _mm256_complexmul_ps(x, y);
    
              _mm256_store_ps((float*)c, z); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) * (*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
    
                                                           const lv_32fc_t* aVector,
    
                                                           const lv_32fc_t* bVector,
    
                                                           unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int halfPoints = num_points / 2;
    
          __m128 x, y, z;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = aVector;
    
      2
          const lv_32fc_t* b = bVector;
    
        2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.

      131072
          for (; number < halfPoints; number++) {
    
      131070
              x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    
      131070
              y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    
      131070
              z = _mm_complexmul_ps(x, y);
    
              _mm_store_ps((float*)c, z); // Store the results back into the C container
    
      131070
              a += 2;
    
      131070
              b += 2;
    
      131070
              c += 2;
    
          }
    
        1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.

      2
          if ((num_points % 2) != 0) {
    
      2
              *c = (*a) * (*b);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE3 */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
    
                                                              const lv_32fc_t* aVector,
    
                                                              const lv_32fc_t* bVector,
    
                                                              unsigned int num_points)
    
      {
    
      2
          lv_32fc_t* cPtr = cVector;
    
      2
          const lv_32fc_t* aPtr = aVector;
    
      2
          const lv_32fc_t* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) * (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
    
                                                         const lv_32fc_t* aVector,
    
                                                         const lv_32fc_t* bVector,
    
                                                         unsigned int num_points)
    
      {
    
          lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    
          lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    
          unsigned int quarter_points = num_points / 4;
    
          float32x4x2_t a_val, b_val, c_val;
    
          float32x4x2_t tmp_real, tmp_imag;
    
          unsigned int number = 0;
    
          for (number = 0; number < quarter_points; ++number) {
    
              a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    
              b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    
              __VOLK_PREFETCH(a_ptr + 4);
    
              __VOLK_PREFETCH(b_ptr + 4);
    
              // multiply the real*real and imag*imag to get real result
    
              // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    
              tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    
              // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    
              tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
    
              // Multiply cross terms to get the imaginary result
    
              // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    
              tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    
              // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    
              tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    
              // store the results
    
              c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    
              c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    
              vst2q_f32((float*)cVector, c_val);
    
              a_ptr += 4;
    
              b_ptr += 4;
    
              cVector += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *cVector++ = (*a_ptr++) * (*b_ptr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_NEON
    
      static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
    
                                                                  const lv_32fc_t* aVector,
    
                                                                  const lv_32fc_t* bVector,
    
                                                                  unsigned int num_points)
    
      {
    
          lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    
          lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    
          unsigned int quarter_points = num_points / 4;
    
          float32x4x2_t a_val, b_val;
    
          float32x4x2_t tmp_imag;
    
          unsigned int number = 0;
    
          for (number = 0; number < quarter_points; ++number) {
    
              a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    
              b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    
              __VOLK_PREFETCH(a_ptr + 4);
    
              __VOLK_PREFETCH(b_ptr + 4);
    
              // do the first multiply
    
              tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    
              tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    
              // use multiply accumulate/subtract to get result
    
              tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
    
              tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
    
              // store
    
              vst2q_f32((float*)cVector, tmp_imag);
    
              // increment pointers
    
              a_ptr += 4;
    
              b_ptr += 4;
    
              cVector += 4;
    
          }
    
          for (number = quarter_points * 4; number < num_points; number++) {
    
              *cVector++ = (*a_ptr++) * (*b_ptr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_NEONV7
    
      extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
    
                                                       const lv_32fc_t* aVector,
    
                                                       const lv_32fc_t* bVector,
    
                                                       unsigned int num_points);
    
      #endif /* LV_HAVE_NEONV7 */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
    
                                                        const lv_32fc_t* aVector,
    
                                                        const lv_32fc_t* bVector,
    
                                                        unsigned int num_points);
    
      2
      static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
    
                                                          const lv_32fc_t* aVector,
    
                                                          const lv_32fc_t* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      2
          volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_x2_multiply_32fc
12			*
13			* \b Overview
14			*
15			* Multiplies two complex vectors and returns the complex result.
16			*
17			* <b>Dispatcher Prototype</b>
18			* \code
19			* void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
20			* lv_32fc_t* bVector, unsigned int num_points); \endcode
21			*
22			* \b Inputs
23			* \li aVector: The first input vector of complex floats.
24			* \li bVector: The second input vector of complex floats.
25			* \li num_points: The number of data points.
26			*
27			* \b Outputs
28			* \li cVector: The output vector of complex floats.
29			*
30			* \b Example
31			* Mix two signals at f=0.3 and 0.1.
32			* \code
33			* int N = 10;
34			* unsigned int alignment = volk_get_alignment();
35			* lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
36			* lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37			* lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38			*
39			* for(unsigned int ii = 0; ii < N; ++ii){
40			* // Generate two tones
41			* float real_1 = std::cos(0.3f * (float)ii);
42			* float imag_1 = std::sin(0.3f * (float)ii);
43			* sig_1[ii] = lv_cmake(real_1, imag_1);
44			* float real_2 = std::cos(0.1f * (float)ii);
45			* float imag_2 = std::sin(0.1f * (float)ii);
46			* sig_2[ii] = lv_cmake(real_2, imag_2);
47			* }
48			*
49			* volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
50			*
51			* volk_free(sig_1);
52			* volk_free(sig_2);
53			* volk_free(out);
54			* \endcode
55			*/
56
57			#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
58			#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
59
60			#include <float.h>
61			#include <inttypes.h>
62			#include <stdio.h>
63			#include <volk/volk_complex.h>
64
65			#if LV_HAVE_AVX2 && LV_HAVE_FMA
66			#include <immintrin.h>
67			/*!
68			\brief Multiplies the two input complex vectors and stores their results in the third
69			vector \param cVector The vector where the results will be stored \param aVector One of
70			the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
71			num_points The number of complex values in aVector and bVector to be multiplied together
72			and stored into cVector
73			*/
74		2	static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
75			const lv_32fc_t* aVector,
76			const lv_32fc_t* bVector,
77			unsigned int num_points)
78			{
79		2	unsigned int number = 0;
80		2	const unsigned int quarterPoints = num_points / 4;
81
82		2	lv_32fc_t* c = cVector;
83		2	const lv_32fc_t* a = aVector;
84		2	const lv_32fc_t* b = bVector;
85
86	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
87
88			const __m256 x =
89		65534	_mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
90			const __m256 y =
91		65534	_mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
92
93		65534	const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
94		65534	const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
95
96		65534	const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
97
98		65534	const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
99
100		65534	const __m256 z = _mm256_fmaddsub_ps(
101			x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
102
103			_mm256_storeu_ps((float*)c, z); // Store the results back into the C container
104
105		65534	a += 4;
106		65534	b += 4;
107		65534	c += 4;
108			}
109
110		2	number = quarterPoints * 4;
111	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
112		6	*c++ = (*a++) * (*b++);
113			}
114		2	}
115			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
116
117
118			#ifdef LV_HAVE_AVX
119			#include <immintrin.h>
120			#include <volk/volk_avx_intrinsics.h>
121
122		2	static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
123			const lv_32fc_t* aVector,
124			const lv_32fc_t* bVector,
125			unsigned int num_points)
126			{
127		2	unsigned int number = 0;
128		2	const unsigned int quarterPoints = num_points / 4;
129
130			__m256 x, y, z;
131		2	lv_32fc_t* c = cVector;
132		2	const lv_32fc_t* a = aVector;
133		2	const lv_32fc_t* b = bVector;
134
135	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
136		65534	x = _mm256_loadu_ps(
137			(float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
138		65534	y = _mm256_loadu_ps(
139			(float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
140		65534	z = _mm256_complexmul_ps(x, y);
141			_mm256_storeu_ps((float*)c, z); // Store the results back into the C container
142
143		65534	a += 4;
144		65534	b += 4;
145		65534	c += 4;
146			}
147
148		2	number = quarterPoints * 4;
149
150	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
151		6	*c++ = (*a++) * (*b++);
152			}
153		2	}
154			#endif /* LV_HAVE_AVX */
155
156
157			#ifdef LV_HAVE_SSE3
158			#include <pmmintrin.h>
159			#include <volk/volk_sse3_intrinsics.h>
160
161		2	static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
162			const lv_32fc_t* aVector,
163			const lv_32fc_t* bVector,
164			unsigned int num_points)
165			{
166		2	unsigned int number = 0;
167		2	const unsigned int halfPoints = num_points / 2;
168
169			__m128 x, y, z;
170		2	lv_32fc_t* c = cVector;
171		2	const lv_32fc_t* a = aVector;
172		2	const lv_32fc_t* b = bVector;
173
174	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (; number < halfPoints; number++) {
175		131070	x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
176		131070	y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
177		131070	z = _mm_complexmul_ps(x, y);
178			_mm_storeu_ps((float*)c, z); // Store the results back into the C container
179
180		131070	a += 2;
181		131070	b += 2;
182		131070	c += 2;
183			}
184
185	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if ((num_points % 2) != 0) {
186		2	*c = (*a) * (*b);
187			}
188		2	}
189			#endif /* LV_HAVE_SSE3 */
190
191
192			#ifdef LV_HAVE_GENERIC
193
194		2	static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
195			const lv_32fc_t* aVector,
196			const lv_32fc_t* bVector,
197			unsigned int num_points)
198			{
199		2	lv_32fc_t* cPtr = cVector;
200		2	const lv_32fc_t* aPtr = aVector;
201		2	const lv_32fc_t* bPtr = bVector;
202		2	unsigned int number = 0;
203
204	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
205		262142	*cPtr++ = (*aPtr++) * (*bPtr++);
206			}
207		2	}
208			#endif /* LV_HAVE_GENERIC */
209
210
211			#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
212			#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
213			#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
214
215			#include <float.h>
216			#include <inttypes.h>
217			#include <stdio.h>
218			#include <volk/volk_complex.h>
219
220			#if LV_HAVE_AVX2 && LV_HAVE_FMA
221			#include <immintrin.h>
222			/*!
223			\brief Multiplies the two input complex vectors and stores their results in the third
224			vector \param cVector The vector where the results will be stored \param aVector One of
225			the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
226			num_points The number of complex values in aVector and bVector to be multiplied together
227			and stored into cVector
228			*/
229		2	static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
230			const lv_32fc_t* aVector,
231			const lv_32fc_t* bVector,
232			unsigned int num_points)
233			{
234		2	unsigned int number = 0;
235		2	const unsigned int quarterPoints = num_points / 4;
236
237		2	lv_32fc_t* c = cVector;
238		2	const lv_32fc_t* a = aVector;
239		2	const lv_32fc_t* b = bVector;
240
241	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
242
243			const __m256 x =
244		65534	_mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
245			const __m256 y =
246		65534	_mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
247
248		65534	const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
249		65534	const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
250
251		65534	const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
252
253		65534	const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
254
255		65534	const __m256 z = _mm256_fmaddsub_ps(
256			x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
257
258			_mm256_store_ps((float*)c, z); // Store the results back into the C container
259
260		65534	a += 4;
261		65534	b += 4;
262		65534	c += 4;
263			}
264
265		2	number = quarterPoints * 4;
266	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
267		6	*c++ = (*a++) * (*b++);
268			}
269		2	}
270			#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
271
272
273			#ifdef LV_HAVE_AVX
274			#include <immintrin.h>
275			#include <volk/volk_avx_intrinsics.h>
276
277		2	static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
278			const lv_32fc_t* aVector,
279			const lv_32fc_t* bVector,
280			unsigned int num_points)
281			{
282		2	unsigned int number = 0;
283		2	const unsigned int quarterPoints = num_points / 4;
284
285			__m256 x, y, z;
286		2	lv_32fc_t* c = cVector;
287		2	const lv_32fc_t* a = aVector;
288		2	const lv_32fc_t* b = bVector;
289
290	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
291		65534	x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
292		65534	y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
293		65534	z = _mm256_complexmul_ps(x, y);
294			_mm256_store_ps((float*)c, z); // Store the results back into the C container
295
296		65534	a += 4;
297		65534	b += 4;
298		65534	c += 4;
299			}
300
301		2	number = quarterPoints * 4;
302
303	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
304		6	*c++ = (*a++) * (*b++);
305			}
306		2	}
307			#endif /* LV_HAVE_AVX */
308
309			#ifdef LV_HAVE_SSE3
310			#include <pmmintrin.h>
311			#include <volk/volk_sse3_intrinsics.h>
312
313		2	static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
314			const lv_32fc_t* aVector,
315			const lv_32fc_t* bVector,
316			unsigned int num_points)
317			{
318		2	unsigned int number = 0;
319		2	const unsigned int halfPoints = num_points / 2;
320
321			__m128 x, y, z;
322		2	lv_32fc_t* c = cVector;
323		2	const lv_32fc_t* a = aVector;
324		2	const lv_32fc_t* b = bVector;
325
326	2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times.	131072	for (; number < halfPoints; number++) {
327		131070	x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
328		131070	y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
329		131070	z = _mm_complexmul_ps(x, y);
330			_mm_store_ps((float*)c, z); // Store the results back into the C container
331
332		131070	a += 2;
333		131070	b += 2;
334		131070	c += 2;
335			}
336
337	1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken.	2	if ((num_points % 2) != 0) {
338		2	*c = (*a) * (*b);
339			}
340		2	}
341			#endif /* LV_HAVE_SSE3 */
342
343
344			#ifdef LV_HAVE_GENERIC
345
346		2	static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
347			const lv_32fc_t* aVector,
348			const lv_32fc_t* bVector,
349			unsigned int num_points)
350			{
351		2	lv_32fc_t* cPtr = cVector;
352		2	const lv_32fc_t* aPtr = aVector;
353		2	const lv_32fc_t* bPtr = bVector;
354		2	unsigned int number = 0;
355
356	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
357		262142	*cPtr++ = (*aPtr++) * (*bPtr++);
358			}
359		2	}
360			#endif /* LV_HAVE_GENERIC */
361
362
363			#ifdef LV_HAVE_NEON
364			#include <arm_neon.h>
365
366			static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
367			const lv_32fc_t* aVector,
368			const lv_32fc_t* bVector,
369			unsigned int num_points)
370			{
371			lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
372			lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
373			unsigned int quarter_points = num_points / 4;
374			float32x4x2_t a_val, b_val, c_val;
375			float32x4x2_t tmp_real, tmp_imag;
376			unsigned int number = 0;
377
378			for (number = 0; number < quarter_points; ++number) {
379			a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
380			b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
381			__VOLK_PREFETCH(a_ptr + 4);
382			__VOLK_PREFETCH(b_ptr + 4);
383
384			// multiply the realreal and imagimag to get real result
385			// a0rb0r|a1rb1r|a2rb2r|a3rb3r
386			tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
387			// a0ib0i|a1ib1i|a2ib2i|a3ib3i
388			tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
389
390			// Multiply cross terms to get the imaginary result
391			// a0rb0i|a1rb1i|a2rb2i|a3rb3i
392			tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
393			// a0ib0r|a1ib1r|a2ib2r|a3ib3r
394			tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
395
396			// store the results
397			c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
398			c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
399			vst2q_f32((float*)cVector, c_val);
400
401			a_ptr += 4;
402			b_ptr += 4;
403			cVector += 4;
404			}
405
406			for (number = quarter_points * 4; number < num_points; number++) {
407			*cVector++ = (*a_ptr++) * (*b_ptr++);
408			}
409			}
410			#endif /* LV_HAVE_NEON */
411
412
413			#ifdef LV_HAVE_NEON
414
415			static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
416			const lv_32fc_t* aVector,
417			const lv_32fc_t* bVector,
418			unsigned int num_points)
419			{
420			lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
421			lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
422			unsigned int quarter_points = num_points / 4;
423			float32x4x2_t a_val, b_val;
424			float32x4x2_t tmp_imag;
425			unsigned int number = 0;
426
427			for (number = 0; number < quarter_points; ++number) {
428			a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
429			b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
430			__VOLK_PREFETCH(a_ptr + 4);
431			__VOLK_PREFETCH(b_ptr + 4);
432
433			// do the first multiply
434			tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
435			tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
436
437			// use multiply accumulate/subtract to get result
438			tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
439			tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
440
441			// store
442			vst2q_f32((float*)cVector, tmp_imag);
443			// increment pointers
444			a_ptr += 4;
445			b_ptr += 4;
446			cVector += 4;
447			}
448
449			for (number = quarter_points * 4; number < num_points; number++) {
450			*cVector++ = (*a_ptr++) * (*b_ptr++);
451			}
452			}
453			#endif /* LV_HAVE_NEON */
454
455
456			#ifdef LV_HAVE_NEONV7
457
458			extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
459			const lv_32fc_t* aVector,
460			const lv_32fc_t* bVector,
461			unsigned int num_points);
462			#endif /* LV_HAVE_NEONV7 */
463
464
465			#ifdef LV_HAVE_ORC
466
467			extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
468			const lv_32fc_t* aVector,
469			const lv_32fc_t* bVector,
470			unsigned int num_points);
471
472		2	static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
473			const lv_32fc_t* aVector,
474			const lv_32fc_t* bVector,
475			unsigned int num_points)
476			{
477		2	volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
478		2	}
479
480			#endif /* LV_HAVE_ORC */
481
482			#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
483