GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_32f_x2_divide_32f.h
Date:	2023-10-23 23:10:04

	Exec	Total	Coverage
Lines:	62	96	64.6%
Functions:	5	7	71.4%
Branches:	14	22	63.6%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_32f_x2_divide_32f
    
       *
    
       * \b Overview
    
       *
    
       * Divides aVector by bVector to produce cVector:
    
       *
    
       * c[i] = a[i] / b[i]
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
    
       * unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li aVector: First vector of input points.
    
       * \li bVector: Second vector of input points.
    
       * \li num_points: The number of values in both input vector.
    
       *
    
       * \b Outputs
    
       * \li cVector: The output vector.
    
       *
    
       * \b Example
    
       * Divide an increasing vector by a decreasing vector
    
       * \code
    
       *   int N = 10;
    
       *   unsigned int alignment = volk_get_alignment();
    
       *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       increasing[ii] = (float)ii;
    
       *       decreasing[ii] = 10.f - (float)ii;
    
       *   }
    
       *
    
       *   volk_32f_x2_divide_32f(out, increasing, decreasing, N);
    
       *
    
       *   for(unsigned int ii = 0; ii < N; ++ii){
    
       *       printf("out[%u] = %1.2f\n", ii, out[ii]);
    
       *   }
    
       *
    
       *   volk_free(increasing);
    
       *   volk_free(decreasing);
    
       *   volk_free(out);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
    
      #define INCLUDED_volk_32f_x2_divide_32f_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX512F
    
      #include <immintrin.h>
    
      ✗
      static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
    
                                                          const float* aVector,
    
                                                          const float* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int sixteenthPoints = num_points / 16;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
      ✗
          const float* bPtr = bVector;
    
          __m512 aVal, bVal, cVal;
    
      ✗
          for (; number < sixteenthPoints; number++) {
    
      ✗
              aVal = _mm512_load_ps(aPtr);
    
      ✗
              bVal = _mm512_load_ps(bPtr);
    
      ✗
              cVal = _mm512_div_ps(aVal, bVal);
    
              _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
    
      ✗
              aPtr += 16;
    
      ✗
              bPtr += 16;
    
      ✗
              cPtr += 16;
    
          }
    
      ✗
          number = sixteenthPoints * 16;
    
      ✗
          for (; number < num_points; number++) {
    
      ✗
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX512F */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
    
                                                      const float* aVector,
    
                                                      const float* bVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m256 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_load_ps(aPtr);
    
      32766
              bVal = _mm256_load_ps(bPtr);
    
      32766
              cVal = _mm256_div_ps(aVal, bVal);
    
              _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #ifdef LV_HAVE_SSE
    
      #include <xmmintrin.h>
    
      2
      static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
    
                                                      const float* aVector,
    
                                                      const float* bVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int quarterPoints = num_points / 4;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m128 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.

      65536
          for (; number < quarterPoints; number++) {
    
      65534
              aVal = _mm_load_ps(aPtr);
    
      65534
              bVal = _mm_load_ps(bPtr);
    
      65534
              cVal = _mm_div_ps(aVal, bVal);
    
              _mm_store_ps(cPtr, cVal); // Store the results back into the C container
    
      65534
              aPtr += 4;
    
      65534
              bPtr += 4;
    
      65534
              cPtr += 4;
    
          }
    
      2
          number = quarterPoints * 4;
    
        2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.

      8
          for (; number < num_points; number++) {
    
      6
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_SSE */
    
      #ifdef LV_HAVE_NEON
    
      #include <arm_neon.h>
    
      static inline void volk_32f_x2_divide_32f_neon(float* cVector,
    
                                                     const float* aVector,
    
                                                     const float* bVector,
    
                                                     unsigned int num_points)
    
      {
    
          float* cPtr = cVector;
    
          const float* aPtr = aVector;
    
          const float* bPtr = bVector;
    
          float32x4x4_t aVal, bVal, bInv, cVal;
    
          const unsigned int eighthPoints = num_points / 16;
    
          unsigned int number = 0;
    
          for (; number < eighthPoints; number++) {
    
              aVal = vld4q_f32(aPtr);
    
              aPtr += 16;
    
              bVal = vld4q_f32(bPtr);
    
              bPtr += 16;
    
              __VOLK_PREFETCH(aPtr + 16);
    
              __VOLK_PREFETCH(bPtr + 16);
    
              bInv.val[0] = vrecpeq_f32(bVal.val[0]);
    
              bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
    
              bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
    
              cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
    
              bInv.val[1] = vrecpeq_f32(bVal.val[1]);
    
              bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
    
              bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
    
              cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
    
              bInv.val[2] = vrecpeq_f32(bVal.val[2]);
    
              bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
    
              bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
    
              cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
    
              bInv.val[3] = vrecpeq_f32(bVal.val[3]);
    
              bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
    
              bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
    
              cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
    
              vst4q_f32(cPtr, cVal);
    
              cPtr += 16;
    
          }
    
          for (number = eighthPoints * 16; number < num_points; number++) {
    
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      }
    
      #endif /* LV_HAVE_NEON */
    
      #ifdef LV_HAVE_GENERIC
    
      2
      static inline void volk_32f_x2_divide_32f_generic(float* cVector,
    
                                                        const float* aVector,
    
                                                        const float* bVector,
    
                                                        unsigned int num_points)
    
      {
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
      2
          unsigned int number = 0;
    
        2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.

      262144
          for (number = 0; number < num_points; number++) {
    
      262142
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_GENERIC */
    
      #ifdef LV_HAVE_ORC
    
      extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
    
                                                    const float* aVector,
    
                                                    const float* bVector,
    
                                                    unsigned int num_points);
    
      2
      static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
    
                                                      const float* aVector,
    
                                                      const float* bVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
    
      2
      }
    
      #endif /* LV_HAVE_ORC */
    
      #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
    
      #ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
    
      #define INCLUDED_volk_32f_x2_divide_32f_u_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_AVX512F
    
      #include <immintrin.h>
    
      ✗
      static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
    
                                                          const float* aVector,
    
                                                          const float* bVector,
    
                                                          unsigned int num_points)
    
      {
    
      ✗
          unsigned int number = 0;
    
      ✗
          const unsigned int sixteenthPoints = num_points / 16;
    
      ✗
          float* cPtr = cVector;
    
      ✗
          const float* aPtr = aVector;
    
      ✗
          const float* bPtr = bVector;
    
          __m512 aVal, bVal, cVal;
    
      ✗
          for (; number < sixteenthPoints; number++) {
    
      ✗
              aVal = _mm512_loadu_ps(aPtr);
    
      ✗
              bVal = _mm512_loadu_ps(bPtr);
    
      ✗
              cVal = _mm512_div_ps(aVal, bVal);
    
              _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      ✗
              aPtr += 16;
    
      ✗
              bPtr += 16;
    
      ✗
              cPtr += 16;
    
          }
    
      ✗
          number = sixteenthPoints * 16;
    
      ✗
          for (; number < num_points; number++) {
    
      ✗
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      ✗
      }
    
      #endif /* LV_HAVE_AVX512F */
    
      #ifdef LV_HAVE_AVX
    
      #include <immintrin.h>
    
      2
      static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
    
                                                      const float* aVector,
    
                                                      const float* bVector,
    
                                                      unsigned int num_points)
    
      {
    
      2
          unsigned int number = 0;
    
      2
          const unsigned int eighthPoints = num_points / 8;
    
      2
          float* cPtr = cVector;
    
      2
          const float* aPtr = aVector;
    
      2
          const float* bPtr = bVector;
    
          __m256 aVal, bVal, cVal;
    
        2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.

      32768
          for (; number < eighthPoints; number++) {
    
      32766
              aVal = _mm256_loadu_ps(aPtr);
    
      32766
              bVal = _mm256_loadu_ps(bPtr);
    
      32766
              cVal = _mm256_div_ps(aVal, bVal);
    
              _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
    
      32766
              aPtr += 8;
    
      32766
              bPtr += 8;
    
      32766
              cPtr += 8;
    
          }
    
      2
          number = eighthPoints * 8;
    
        2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.

      16
          for (; number < num_points; number++) {
    
      14
              *cPtr++ = (*aPtr++) / (*bPtr++);
    
          }
    
      2
      }
    
      #endif /* LV_HAVE_AVX */
    
      #endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */

Line	Branch	Exec	Source
1			/* -*- c++ -*- */
2			/*
3			* Copyright 2012, 2014 Free Software Foundation, Inc.
4			*
5			* This file is part of VOLK
6			*
7			* SPDX-License-Identifier: LGPL-3.0-or-later
8			*/
9
10			/*!
11			* \page volk_32f_x2_divide_32f
12			*
13			* \b Overview
14			*
15			* Divides aVector by bVector to produce cVector:
16			*
17			* c[i] = a[i] / b[i]
18			*
19			* <b>Dispatcher Prototype</b>
20			* \code
21			* void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
22			* unsigned int num_points) \endcode
23			*
24			* \b Inputs
25			* \li aVector: First vector of input points.
26			* \li bVector: Second vector of input points.
27			* \li num_points: The number of values in both input vector.
28			*
29			* \b Outputs
30			* \li cVector: The output vector.
31			*
32			* \b Example
33			* Divide an increasing vector by a decreasing vector
34			* \code
35			* int N = 10;
36			* unsigned int alignment = volk_get_alignment();
37			* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38			* float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
39			* float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40			*
41			* for(unsigned int ii = 0; ii < N; ++ii){
42			* increasing[ii] = (float)ii;
43			* decreasing[ii] = 10.f - (float)ii;
44			* }
45			*
46			* volk_32f_x2_divide_32f(out, increasing, decreasing, N);
47			*
48			* for(unsigned int ii = 0; ii < N; ++ii){
49			* printf("out[%u] = %1.2f\n", ii, out[ii]);
50			* }
51			*
52			* volk_free(increasing);
53			* volk_free(decreasing);
54			* volk_free(out);
55			* \endcode
56			*/
57
58			#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
59			#define INCLUDED_volk_32f_x2_divide_32f_a_H
60
61			#include <inttypes.h>
62			#include <stdio.h>
63
64			#ifdef LV_HAVE_AVX512F
65			#include <immintrin.h>
66
67		✗	static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
68			const float* aVector,
69			const float* bVector,
70			unsigned int num_points)
71			{
72		✗	unsigned int number = 0;
73		✗	const unsigned int sixteenthPoints = num_points / 16;
74
75		✗	float* cPtr = cVector;
76		✗	const float* aPtr = aVector;
77		✗	const float* bPtr = bVector;
78
79			__m512 aVal, bVal, cVal;
80		✗	for (; number < sixteenthPoints; number++) {
81		✗	aVal = _mm512_load_ps(aPtr);
82		✗	bVal = _mm512_load_ps(bPtr);
83
84		✗	cVal = _mm512_div_ps(aVal, bVal);
85
86			_mm512_store_ps(cPtr, cVal); // Store the results back into the C container
87
88		✗	aPtr += 16;
89		✗	bPtr += 16;
90		✗	cPtr += 16;
91			}
92
93		✗	number = sixteenthPoints * 16;
94		✗	for (; number < num_points; number++) {
95		✗	*cPtr++ = (*aPtr++) / (*bPtr++);
96			}
97		✗	}
98			#endif /* LV_HAVE_AVX512F */
99
100
101			#ifdef LV_HAVE_AVX
102			#include <immintrin.h>
103
104		2	static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
105			const float* aVector,
106			const float* bVector,
107			unsigned int num_points)
108			{
109		2	unsigned int number = 0;
110		2	const unsigned int eighthPoints = num_points / 8;
111
112		2	float* cPtr = cVector;
113		2	const float* aPtr = aVector;
114		2	const float* bPtr = bVector;
115
116			__m256 aVal, bVal, cVal;
117	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
118		32766	aVal = _mm256_load_ps(aPtr);
119		32766	bVal = _mm256_load_ps(bPtr);
120
121		32766	cVal = _mm256_div_ps(aVal, bVal);
122
123			_mm256_store_ps(cPtr, cVal); // Store the results back into the C container
124
125		32766	aPtr += 8;
126		32766	bPtr += 8;
127		32766	cPtr += 8;
128			}
129
130		2	number = eighthPoints * 8;
131	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
132		14	*cPtr++ = (*aPtr++) / (*bPtr++);
133			}
134		2	}
135			#endif /* LV_HAVE_AVX */
136
137
138			#ifdef LV_HAVE_SSE
139			#include <xmmintrin.h>
140
141		2	static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
142			const float* aVector,
143			const float* bVector,
144			unsigned int num_points)
145			{
146		2	unsigned int number = 0;
147		2	const unsigned int quarterPoints = num_points / 4;
148
149		2	float* cPtr = cVector;
150		2	const float* aPtr = aVector;
151		2	const float* bPtr = bVector;
152
153			__m128 aVal, bVal, cVal;
154	2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times.	65536	for (; number < quarterPoints; number++) {
155		65534	aVal = _mm_load_ps(aPtr);
156		65534	bVal = _mm_load_ps(bPtr);
157
158		65534	cVal = _mm_div_ps(aVal, bVal);
159
160			_mm_store_ps(cPtr, cVal); // Store the results back into the C container
161
162		65534	aPtr += 4;
163		65534	bPtr += 4;
164		65534	cPtr += 4;
165			}
166
167		2	number = quarterPoints * 4;
168	2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times.	8	for (; number < num_points; number++) {
169		6	*cPtr++ = (*aPtr++) / (*bPtr++);
170			}
171		2	}
172			#endif /* LV_HAVE_SSE */
173
174
175			#ifdef LV_HAVE_NEON
176			#include <arm_neon.h>
177
178			static inline void volk_32f_x2_divide_32f_neon(float* cVector,
179			const float* aVector,
180			const float* bVector,
181			unsigned int num_points)
182			{
183			float* cPtr = cVector;
184			const float* aPtr = aVector;
185			const float* bPtr = bVector;
186
187			float32x4x4_t aVal, bVal, bInv, cVal;
188
189			const unsigned int eighthPoints = num_points / 16;
190			unsigned int number = 0;
191			for (; number < eighthPoints; number++) {
192			aVal = vld4q_f32(aPtr);
193			aPtr += 16;
194			bVal = vld4q_f32(bPtr);
195			bPtr += 16;
196
197			__VOLK_PREFETCH(aPtr + 16);
198			__VOLK_PREFETCH(bPtr + 16);
199
200			bInv.val[0] = vrecpeq_f32(bVal.val[0]);
201			bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
202			bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
203			cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
204
205			bInv.val[1] = vrecpeq_f32(bVal.val[1]);
206			bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
207			bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
208			cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
209
210			bInv.val[2] = vrecpeq_f32(bVal.val[2]);
211			bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
212			bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
213			cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
214
215			bInv.val[3] = vrecpeq_f32(bVal.val[3]);
216			bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
217			bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
218			cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
219
220			vst4q_f32(cPtr, cVal);
221			cPtr += 16;
222			}
223
224			for (number = eighthPoints * 16; number < num_points; number++) {
225			*cPtr++ = (*aPtr++) / (*bPtr++);
226			}
227			}
228
229			#endif /* LV_HAVE_NEON */
230
231
232			#ifdef LV_HAVE_GENERIC
233
234		2	static inline void volk_32f_x2_divide_32f_generic(float* cVector,
235			const float* aVector,
236			const float* bVector,
237			unsigned int num_points)
238			{
239		2	float* cPtr = cVector;
240		2	const float* aPtr = aVector;
241		2	const float* bPtr = bVector;
242		2	unsigned int number = 0;
243
244	2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times.	262144	for (number = 0; number < num_points; number++) {
245		262142	*cPtr++ = (*aPtr++) / (*bPtr++);
246			}
247		2	}
248			#endif /* LV_HAVE_GENERIC */
249
250
251			#ifdef LV_HAVE_ORC
252
253			extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
254			const float* aVector,
255			const float* bVector,
256			unsigned int num_points);
257
258		2	static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
259			const float* aVector,
260			const float* bVector,
261			unsigned int num_points)
262			{
263		2	volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
264		2	}
265			#endif /* LV_HAVE_ORC */
266
267
268			#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
269
270
271			#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
272			#define INCLUDED_volk_32f_x2_divide_32f_u_H
273
274			#include <inttypes.h>
275			#include <stdio.h>
276
277			#ifdef LV_HAVE_AVX512F
278			#include <immintrin.h>
279
280		✗	static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
281			const float* aVector,
282			const float* bVector,
283			unsigned int num_points)
284			{
285		✗	unsigned int number = 0;
286		✗	const unsigned int sixteenthPoints = num_points / 16;
287
288		✗	float* cPtr = cVector;
289		✗	const float* aPtr = aVector;
290		✗	const float* bPtr = bVector;
291
292			__m512 aVal, bVal, cVal;
293		✗	for (; number < sixteenthPoints; number++) {
294		✗	aVal = _mm512_loadu_ps(aPtr);
295		✗	bVal = _mm512_loadu_ps(bPtr);
296
297		✗	cVal = _mm512_div_ps(aVal, bVal);
298
299			_mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
300
301		✗	aPtr += 16;
302		✗	bPtr += 16;
303		✗	cPtr += 16;
304			}
305
306		✗	number = sixteenthPoints * 16;
307		✗	for (; number < num_points; number++) {
308		✗	*cPtr++ = (*aPtr++) / (*bPtr++);
309			}
310		✗	}
311			#endif /* LV_HAVE_AVX512F */
312
313
314			#ifdef LV_HAVE_AVX
315			#include <immintrin.h>
316
317		2	static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
318			const float* aVector,
319			const float* bVector,
320			unsigned int num_points)
321			{
322		2	unsigned int number = 0;
323		2	const unsigned int eighthPoints = num_points / 8;
324
325		2	float* cPtr = cVector;
326		2	const float* aPtr = aVector;
327		2	const float* bPtr = bVector;
328
329			__m256 aVal, bVal, cVal;
330	2/2 ✓ Branch 0 taken 32766 times. ✓ Branch 1 taken 2 times.	32768	for (; number < eighthPoints; number++) {
331		32766	aVal = _mm256_loadu_ps(aPtr);
332		32766	bVal = _mm256_loadu_ps(bPtr);
333
334		32766	cVal = _mm256_div_ps(aVal, bVal);
335
336			_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
337
338		32766	aPtr += 8;
339		32766	bPtr += 8;
340		32766	cPtr += 8;
341			}
342
343		2	number = eighthPoints * 8;
344	2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 2 times.	16	for (; number < num_points; number++) {
345		14	*cPtr++ = (*aPtr++) / (*bPtr++);
346			}
347		2	}
348			#endif /* LV_HAVE_AVX */
349
350			#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
351