GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32fc_x2_divide_32fc.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	115	115	100.0%
Functions:	5	5	100.0%
Branches:	16	16	100.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2016 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32fc_x2_divide_32fc
    
       *
    
       * \b Overview
    
       *
    
       * Divide first vector of complexes element-wise by second.
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
    
       * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
    
       *
    
       * \b Inputs
    
       * \li numeratorVector: The numerator complex values.
    
       * \li numeratorVector: The denumerator complex values.
    
       * \li num_points: The number of data points.
    
       *
    
       * \b Outputs
    
       * \li outputVector: The output vector complex floats.
    
       *
    
       * \b Example
    
       * divide a complex vector by itself, demonstrating the result should be pretty close to
    
       * 1+0j.
    
       *
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   lv_32fc_t* input_vector  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
    
       *
    
       *   float delta = 2.f*M_PI / (float)N;
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       float real_1 = std::cos(0.3f * (float)ii);
    
       *       float imag_1 = std::sin(0.3f * (float)ii);
    
       *       input_vector[ii] = lv_cmake(real_1, imag_1);
    
       *   }
    
       *
    
       *   volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
    
       *   }
    
       *   printf("\n");
    
       *
    
       *   volk_free(input_vector);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
    
      #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_GENERIC
    
      4
      static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
    
                                                          const lv_32fc_t* aVector,
    
                                                          const lv_32fc_t* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      4
          lv_32fc_t* cPtr = cVector;
    
      4
          const lv_32fc_t* aPtr = aVector;
    
      4
          const lv_32fc_t* bPtr = bVector;
    
        2/2✓ Branch 0 taken 262156 times.
✓ Branch 1 taken 4 times.

      262160
          for (unsigned int number = 0; number < num_points; number++) {
    
      262156
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      4
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
    
                                                         const lv_32fc_t* numeratorVector,
    
                                                         const lv_32fc_t* denumeratorVector,
    
                                                         unsigned int num_points)
    
      {
    
          /*
    
           * we'll do the "classical"
    
           *  a      a b*
    
           * --- = -------
    
           *  b     |b|^2
    
           * */
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128 num01, num23, den01, den23, norm, result;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = numeratorVector;
    
      2
          const lv_32fc_t* b = denumeratorVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              num01 = _mm_loadu_ps((float*)a);                  // first pair
    
      65534
              den01 = _mm_loadu_ps((float*)b);                  // first pair
    
      65534
              num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
    
      65534
              a += 2;
    
      65534
              b += 2;
    
      65534
              num23 = _mm_loadu_ps((float*)a);                  // second pair
    
      65534
              den23 = _mm_loadu_ps((float*)b);                  // second pair
    
      65534
              num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
    
      65534
              a += 2;
    
      65534
              b += 2;
    
      65534
              norm = _mm_magnitudesquared_ps_sse3(den01, den23);
    
      65534
              den01 = _mm_unpacklo_ps(norm, norm);
    
      65534
              den23 = _mm_unpackhi_ps(norm, norm);
    
      65534
              result = _mm_div_ps(num01, den01);
    
              _mm_storeu_ps((float*)c, result); // Store the results back into the C container
    
      65534
              c += 2;
    
      65534
              result = _mm_div_ps(num23, den23);
    
              _mm_storeu_ps((float*)c, result); // Store the results back into the C container
    
      65534
              c += 2;
    
          }
    
      2
          number *= 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c = (*a) / (*b);
    
      6
              a++;
    
      6
              b++;
    
      6
              c++;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE3 */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
    
                                                        const lv_32fc_t* numeratorVector,
    
                                                        const lv_32fc_t* denumeratorVector,
    
                                                        unsigned int num_points)
    
      {
    
          /*
    
           * we'll do the "classical"
    
           *  a      a b*
    
           * --- = -------
    
           *  b     |b|^2
    
           * */
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = numeratorVector;
    
      2
          const lv_32fc_t* b = denumeratorVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              num = _mm256_loadu_ps(
    
                  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      65534
              denum = _mm256_loadu_ps(
    
                  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    
      65534
              mul_conj = _mm256_complexconjugatemul_ps(num, denum);
    
      65534
              sq = _mm256_mul_ps(denum, denum); // Square the values
    
      65534
              mag_sq_un = _mm256_hadd_ps(
    
                  sq, sq); // obtain the actual squared magnitude, although out of order
    
      65534
              mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
    
              // best guide I found on using these functions:
    
              // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
    
      65534
              div = _mm256_div_ps(mul_conj, mag_sq);
    
              _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
    
      65534
              a += 4;
    
      65534
              b += 4;
    
      65534
              c += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c++ = (*a++) / (*b++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
    
      #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
    
      #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
    
      #include <float.h>
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #include <volk/volk_complex.h>
    
      #ifdef LV_HAVE_SSE3
    
      #include <pmmintrin.h>
    
      #include <volk/volk_sse3_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
    
                                                         const lv_32fc_t* numeratorVector,
    
                                                         const lv_32fc_t* denumeratorVector,
    
                                                         unsigned int num_points)
    
      {
    
          /*
    
           * we'll do the "classical"
    
           *  a      a b*
    
           * --- = -------
    
           *  b     |b|^2
    
           * */
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
          __m128 num01, num23, den01, den23, norm, result;
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = numeratorVector;
    
      2
          const lv_32fc_t* b = denumeratorVector;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              num01 = _mm_load_ps((float*)a);                   // first pair
    
      65534
              den01 = _mm_load_ps((float*)b);                   // first pair
    
      65534
              num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
    
      65534
              a += 2;
    
      65534
              b += 2;
    
      65534
              num23 = _mm_load_ps((float*)a);                   // second pair
    
      65534
              den23 = _mm_load_ps((float*)b);                   // second pair
    
      65534
              num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
    
      65534
              a += 2;
    
      65534
              b += 2;
    
      65534
              norm = _mm_magnitudesquared_ps_sse3(den01, den23);
    
      65534
              den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
    
      65534
              den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
    
      65534
              result = _mm_div_ps(num01, den01);
    
              _mm_store_ps((float*)c, result); // Store the results back into the C container
    
      65534
              c += 2;
    
      65534
              result = _mm_div_ps(num23, den23);
    
              _mm_store_ps((float*)c, result); // Store the results back into the C container
    
      65534
              c += 2;
    
          }
    
      2
          number *= 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *c = (*a) / (*b);
    
      6
              a++;
    
      6
              b++;
    
      6
              c++;
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      #include <volk/volk_avx_intrinsics.h>
    
      2
      static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
    
                                                        const lv_32fc_t* numeratorVector,
    
                                                        const lv_32fc_t* denumeratorVector,
    
                                                        unsigned int num_points)
    
      {
    
          /*
    
           * Guide to AVX intrisics:
    
           * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
    
           *
    
           * we'll do the "classical"
    
           *  a      a b*
    
           * --- = -------
    
           *  b     |b|^2
    
           *
    
           */
    
      2
          lv_32fc_t* c = cVector;
    
      2
          const lv_32fc_t* a = numeratorVector;
    
      2
          const lv_32fc_t* b = denumeratorVector;
    
      2
          const unsigned int eigthPoints = num_points / 8;
    
          __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (unsigned int number = 0; number < eigthPoints; number++) {
    
              // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    
      32766
              num01 = _mm256_load_ps((float*)a);
    
      32766
              denum01 = _mm256_load_ps((float*)b);
    
      32766
              num01 = _mm256_complexconjugatemul_ps(num01, denum01);
    
      32766
              a += 4;
    
      32766
              b += 4;
    
      32766
              num23 = _mm256_load_ps((float*)a);
    
      32766
              denum23 = _mm256_load_ps((float*)b);
    
      32766
              num23 = _mm256_complexconjugatemul_ps(num23, denum23);
    
      32766
              a += 4;
    
      32766
              b += 4;
    
      65532
              complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
    
                                              _mm256_mul_ps(denum23, denum23));
    
      32766
              denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
    
      32766
              denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
    
      32766
              result0 = _mm256_div_ps(num01, denum01);
    
      32766
              result1 = _mm256_div_ps(num23, denum23);
    
              _mm256_store_ps((float*)c, result0);
    
      32766
              c += 4;
    
              _mm256_store_ps((float*)c, result1);
    
      32766
              c += 4;
    
          }
    
      2
          volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
    
                                                       const lv_32fc_t* aVector,
    
                                                       const lv_32fc_t* bVector,
    
                                                       unsigned int num_points)
    
      {
    
          lv_32fc_t* cPtr = cVector;
    
          const lv_32fc_t* aPtr = aVector;
    
          const lv_32fc_t* bPtr = bVector;
    
          float32x4x2_t aVal, bVal, cVal;
    
          float32x4_t bAbs, bAbsInv;
    
          const unsigned int quarterPoints = num_points / 4;
    
          unsigned int number = 0;
    
          for (; number < quarterPoints; number++) {
    
              aVal = vld2q_f32((const float*)(aPtr));
    
              bVal = vld2q_f32((const float*)(bPtr));
    
              aPtr += 4;
    
              bPtr += 4;
    
              __VOLK_PREFETCH(aPtr + 4);
    
              __VOLK_PREFETCH(bPtr + 4);
    
              bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
    
              bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
    
              bAbsInv = vrecpeq_f32(bAbs);
    
              bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
    
              bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
    
              cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
    
              cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
    
              cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
    
              cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
    
              cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
    
              cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
    
              vst2q_f32((float*)(cPtr), cVal);
    
              cPtr += 4;
    
          }
    
          for (number = quarterPoints * 4; number < num_points; number++) {
    
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */

Line	Branch	Exec	Source
1			/* -- c++ -- */
2			/*
3			* Copyright 2016 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32fc_x2_divide_32fc
12			*
13			* \b Overview
14			*
15			* Divide first vector of complexes element-wise by second.
16			*
17			* <b>Dispatcher Prototype</b>
18			* \code
19			* void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
20			* const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
21			*
22			* \b Inputs
23			* \li numeratorVector: The numerator complex values.
24			* \li numeratorVector: The denumerator complex values.
25			* \li num_points: The number of data points.
26			*
27			* \b Outputs
28			* \li outputVector: The output vector complex floats.
29			*
30			* \b Example
31			* divide a complex vector by itself, demonstrating the result should be pretty close to
32			* 1+0j.
33			*
34			* \code
35			* int N = 10;
36			* unsigned int alignment = volk_get_alignment();
37			* lv_32fc_t* input_vector = (lv_32fc_t)volk_malloc(sizeof(lv_32fc_t)N, alignment);
38			* lv_32fc_t* out = (lv_32fc_t)volk_malloc(sizeof(lv_32fc_t)N, alignment);
39			*
40			* float delta = 2.f*M_PI / (float)N;
41			* for(unsigned int ii = 0; ii < N; ++ii){
42			* float real_1 = std::cos(0.3f * (float)ii);
43			* float imag_1 = std::sin(0.3f * (float)ii);
44			* input_vector[ii] = lv_cmake(real_1, imag_1);
45			* }
46			*
47			* volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
48			*
49			* for(unsigned int ii = 0; ii < N; ++ii){
50			* printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
51			* }
52			* printf("\n");
53			*
54			* volk_free(input_vector);
55			* volk_free(out);
56			* \endcode
57			*/
58
59			#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60			#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
61
62			#include <float.h>
63			#include <inttypes.h>
64			#include <volk/volk_complex.h>
65
66
67			#ifdef LV_HAVE_GENERIC
68
69		4	static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
70			const lv_32fc_t* aVector,
71			const lv_32fc_t* bVector,
72			unsigned int num_points)
73			{
74		4	lv_32fc_t* cPtr = cVector;
75		4	const lv_32fc_t* aPtr = aVector;
76		4	const lv_32fc_t* bPtr = bVector;
77
78	2/2 ✓ Branch 0 taken 262156 times. ✓ Branch 1 taken 4 times.	262160	for (unsigned int number = 0; number < num_points; number++) {
79		262156	cPtr++ = (aPtr++) / (*bPtr++);
80			}
81		4	}
82			#endif /* LV_HAVE_GENERIC */
83
84
85			#ifdef LV_HAVE_SSE3
86			#include <pmmintrin.h>
87			#include <volk/volk_sse3_intrinsics.h>
88
89		2	static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
90			const lv_32fc_t* numeratorVector,
91			const lv_32fc_t* denumeratorVector,
92			unsigned int num_points)
93			{
94			/*
95			* we'll do the "classical"
96			* a a b*
97			* --- = -------
98			* b \|b\|^2
99			* */
100		2	unsigned int number = 0;
101		2	const unsigned int quarterPoints = num_points / 4;
102
103			__m128 num01, num23, den01, den23, norm, result;
104		2	lv_32fc_t* c = cVector;
105		2	const lv_32fc_t* a = numeratorVector;
106		2	const lv_32fc_t* b = denumeratorVector;
107
108	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
109		65534	num01 = _mm_loadu_ps((float*)a); // first pair
110		65534	den01 = _mm_loadu_ps((float*)b); // first pair
111		65534	num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
112		65534	a += 2;
113		65534	b += 2;
114
115		65534	num23 = _mm_loadu_ps((float*)a); // second pair
116		65534	den23 = _mm_loadu_ps((float*)b); // second pair
117		65534	num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
118		65534	a += 2;
119		65534	b += 2;
120
121		65534	norm = _mm_magnitudesquared_ps_sse3(den01, den23);
122		65534	den01 = _mm_unpacklo_ps(norm, norm);
123		65534	den23 = _mm_unpackhi_ps(norm, norm);
124
125		65534	result = _mm_div_ps(num01, den01);
126			_mm_storeu_ps((float*)c, result); // Store the results back into the C container
127		65534	c += 2;
128		65534	result = _mm_div_ps(num23, den23);
129			_mm_storeu_ps((float*)c, result); // Store the results back into the C container
130		65534	c += 2;
131			}
132
133		2	number *= 4;
134	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
135		6	c = (a) / (*b);
136		6	a++;
137		6	b++;
138		6	c++;
139			}
140		2	}
141			#endif /* LV_HAVE_SSE3 */
142
143
144			#ifdef LV_HAVE_AVX
145			#include <immintrin.h>
146			#include <volk/volk_avx_intrinsics.h>
147
148		2	static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
149			const lv_32fc_t* numeratorVector,
150			const lv_32fc_t* denumeratorVector,
151			unsigned int num_points)
152			{
153			/*
154			* we'll do the "classical"
155			* a a b*
156			* --- = -------
157			* b \|b\|^2
158			* */
159		2	unsigned int number = 0;
160		2	const unsigned int quarterPoints = num_points / 4;
161
162			__m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
163		2	lv_32fc_t* c = cVector;
164		2	const lv_32fc_t* a = numeratorVector;
165		2	const lv_32fc_t* b = denumeratorVector;
166
167	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
168		65534	num = _mm256_loadu_ps(
169			(float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
170		65534	denum = _mm256_loadu_ps(
171			(float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
172		65534	mul_conj = _mm256_complexconjugatemul_ps(num, denum);
173		65534	sq = _mm256_mul_ps(denum, denum); // Square the values
174		65534	mag_sq_un = _mm256_hadd_ps(
175			sq, sq); // obtain the actual squared magnitude, although out of order
176		65534	mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
177			// best guide I found on using these functions:
178			// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
179		65534	div = _mm256_div_ps(mul_conj, mag_sq);
180
181			_mm256_storeu_ps((float*)c, div); // Store the results back into the C container
182
183		65534	a += 4;
184		65534	b += 4;
185		65534	c += 4;
186			}
187
188		2	number = quarterPoints * 4;
189
190	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
191		6	c++ = (a++) / (*b++);
192			}
193		2	}
194			#endif /* LV_HAVE_AVX */
195
196
197			#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
198
199
200			#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201			#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
202
203			#include <float.h>
204			#include <inttypes.h>
205			#include <stdio.h>
206			#include <volk/volk_complex.h>
207
208			#ifdef LV_HAVE_SSE3
209			#include <pmmintrin.h>
210			#include <volk/volk_sse3_intrinsics.h>
211
212		2	static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
213			const lv_32fc_t* numeratorVector,
214			const lv_32fc_t* denumeratorVector,
215			unsigned int num_points)
216			{
217			/*
218			* we'll do the "classical"
219			* a a b*
220			* --- = -------
221			* b \|b\|^2
222			* */
223		2	unsigned int number = 0;
224		2	const unsigned int quarterPoints = num_points / 4;
225
226			__m128 num01, num23, den01, den23, norm, result;
227		2	lv_32fc_t* c = cVector;
228		2	const lv_32fc_t* a = numeratorVector;
229		2	const lv_32fc_t* b = denumeratorVector;
230
231	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
232		65534	num01 = _mm_load_ps((float*)a); // first pair
233		65534	den01 = _mm_load_ps((float*)b); // first pair
234		65534	num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
235		65534	a += 2;
236		65534	b += 2;
237
238		65534	num23 = _mm_load_ps((float*)a); // second pair
239		65534	den23 = _mm_load_ps((float*)b); // second pair
240		65534	num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
241		65534	a += 2;
242		65534	b += 2;
243
244		65534	norm = _mm_magnitudesquared_ps_sse3(den01, den23);
245
246		65534	den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
247		65534	den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
248
249		65534	result = _mm_div_ps(num01, den01);
250			_mm_store_ps((float*)c, result); // Store the results back into the C container
251		65534	c += 2;
252		65534	result = _mm_div_ps(num23, den23);
253			_mm_store_ps((float*)c, result); // Store the results back into the C container
254		65534	c += 2;
255			}
256
257		2	number *= 4;
258	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
259		6	c = (a) / (*b);
260		6	a++;
261		6	b++;
262		6	c++;
263			}
264		2	}
265			#endif /* LV_HAVE_SSE */
266
267			#ifdef LV_HAVE_AVX
268			#include <immintrin.h>
269			#include <volk/volk_avx_intrinsics.h>
270
271		2	static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
272			const lv_32fc_t* numeratorVector,
273			const lv_32fc_t* denumeratorVector,
274			unsigned int num_points)
275			{
276			/*
277			* Guide to AVX intrisics:
278			* https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
279			*
280			* we'll do the "classical"
281			* a a b*
282			* --- = -------
283			* b \|b\|^2
284			*
285			*/
286		2	lv_32fc_t* c = cVector;
287		2	const lv_32fc_t* a = numeratorVector;
288		2	const lv_32fc_t* b = denumeratorVector;
289
290		2	const unsigned int eigthPoints = num_points / 8;
291
292			__m256 num01, num23, denum01, denum23, complex_result, result0, result1;
293
294	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (unsigned int number = 0; number < eigthPoints; number++) {
295			// Load the ar + ai, br + bi ... as ar,ai,br,bi ...
296		32766	num01 = _mm256_load_ps((float*)a);
297		32766	denum01 = _mm256_load_ps((float*)b);
298
299		32766	num01 = _mm256_complexconjugatemul_ps(num01, denum01);
300		32766	a += 4;
301		32766	b += 4;
302
303		32766	num23 = _mm256_load_ps((float*)a);
304		32766	denum23 = _mm256_load_ps((float*)b);
305		32766	num23 = _mm256_complexconjugatemul_ps(num23, denum23);
306		32766	a += 4;
307		32766	b += 4;
308
309		65532	complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
310			_mm256_mul_ps(denum23, denum23));
311
312		32766	denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
313		32766	denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
314
315		32766	result0 = _mm256_div_ps(num01, denum01);
316		32766	result1 = _mm256_div_ps(num23, denum23);
317
318			_mm256_store_ps((float*)c, result0);
319		32766	c += 4;
320			_mm256_store_ps((float*)c, result1);
321		32766	c += 4;
322			}
323
324		2	volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
325		2	}
326			#endif /* LV_HAVE_AVX */
327
328
329			#ifdef LV_HAVE_NEON
330			#include <arm_neon.h>
331
332			static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
333			const lv_32fc_t* aVector,
334			const lv_32fc_t* bVector,
335			unsigned int num_points)
336			{
337			lv_32fc_t* cPtr = cVector;
338			const lv_32fc_t* aPtr = aVector;
339			const lv_32fc_t* bPtr = bVector;
340
341			float32x4x2_t aVal, bVal, cVal;
342			float32x4_t bAbs, bAbsInv;
343
344			const unsigned int quarterPoints = num_points / 4;
345			unsigned int number = 0;
346			for (; number < quarterPoints; number++) {
347			aVal = vld2q_f32((const float*)(aPtr));
348			bVal = vld2q_f32((const float*)(bPtr));
349			aPtr += 4;
350			bPtr += 4;
351			__VOLK_PREFETCH(aPtr + 4);
352			__VOLK_PREFETCH(bPtr + 4);
353
354			bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
355			bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
356
357			bAbsInv = vrecpeq_f32(bAbs);
358			bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
359			bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
360
361			cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
362			cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
363			cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
364
365			cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
366			cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
367			cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
368
369			vst2q_f32((float*)(cPtr), cVal);
370			cPtr += 4;
371			}
372
373			for (number = quarterPoints * 4; number < num_points; number++) {
374			cPtr++ = (aPtr++) / (*bPtr++);
375			}
376			}
377			#endif /* LV_HAVE_NEON */
378
379
380			#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
381