GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_divide_32f.h
Date: 2023-10-23 23:10:04
             Exec  Total  Coverage
Lines:         62     96     64.6%
Functions:      5      7     71.4%
Branches:      14     22     63.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_divide_32f
12 *
13 * \b Overview
14 *
15 * Divides aVector by bVector to produce cVector:
16 *
17 * c[i] = a[i] / b[i]
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
22 * unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li aVector: First vector of input points.
26 * \li bVector: Second vector of input points.
27 * \li num_points: The number of values in both input vectors.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 * Divide an increasing vector by a decreasing vector
34 * \code
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40 *
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * increasing[ii] = (float)ii;
43 * decreasing[ii] = 10.f - (float)ii;
44 * }
45 *
46 * volk_32f_x2_divide_32f(out, increasing, decreasing, N);
47 *
48 * for(unsigned int ii = 0; ii < N; ++ii){
49 * printf("out[%u] = %1.2f\n", ii, out[ii]);
50 * }
51 *
52 * volk_free(increasing);
53 * volk_free(decreasing);
54 * volk_free(out);
55 * \endcode
56 */
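For reference, the example above divides {0, 1, ..., 9} element-wise by {10, 9, ..., 1}, so with the %1.2f format it should print approximately:

    out[0] = 0.00
    out[1] = 0.11
    out[2] = 0.25
    out[3] = 0.43
    out[4] = 0.67
    out[5] = 1.00
    out[6] = 1.50
    out[7] = 2.33
    out[8] = 4.00
    out[9] = 9.00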
57
58 #ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
59 #define INCLUDED_volk_32f_x2_divide_32f_a_H
60
61 #include <inttypes.h>
62 #include <stdio.h>
63
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
66
67 static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
68 const float* aVector,
69 const float* bVector,
70 unsigned int num_points)
71 {
72 unsigned int number = 0;
73 const unsigned int sixteenthPoints = num_points / 16;
74
75 float* cPtr = cVector;
76 const float* aPtr = aVector;
77 const float* bPtr = bVector;
78
79 __m512 aVal, bVal, cVal;
80 for (; number < sixteenthPoints; number++) {
81 aVal = _mm512_load_ps(aPtr);
82 bVal = _mm512_load_ps(bPtr);
83
84 cVal = _mm512_div_ps(aVal, bVal);
85
86 _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
87
88 aPtr += 16;
89 bPtr += 16;
90 cPtr += 16;
91 }
92
93 number = sixteenthPoints * 16;
94 for (; number < num_points; number++) {
95 *cPtr++ = (*aPtr++) / (*bPtr++);
96 }
97 }
98 #endif /* LV_HAVE_AVX512F */
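Note on coverage: the AVX-512F protokernels (this one and its unaligned twin below) carry no execution counts, which matches the 5-of-7 function coverage in the summary; presumably the test machine never took this path. A minimal sketch of how a test could exercise it directly, assuming an AVX-512F-capable CPU and VOLK's generated *_manual entry point (the impl-name string "a_avx512f" is an assumption based on the function suffix):

    #include <volk/volk.h>

    /* Hypothetical helper: bypass the dispatcher and force the aligned
     * AVX-512F protokernel. Buffers must come from volk_malloc (aligned)
     * and the CPU must support AVX-512F. */
    static void divide_via_avx512f(float* c, const float* a, const float* b, unsigned int n)
    {
        volk_32f_x2_divide_32f_manual(c, a, b, n, "a_avx512f");
    }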
99
100
101 #ifdef LV_HAVE_AVX
102 #include <immintrin.h>
103
104 2 static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
105 const float* aVector,
106 const float* bVector,
107 unsigned int num_points)
108 {
109 2 unsigned int number = 0;
110 2 const unsigned int eighthPoints = num_points / 8;
111
112 2 float* cPtr = cVector;
113 2 const float* aPtr = aVector;
114 2 const float* bPtr = bVector;
115
116 __m256 aVal, bVal, cVal;
117 2/2 32768 for (; number < eighthPoints; number++) {
        ✓ Branch 0 taken 32766 times.
        ✓ Branch 1 taken 2 times.
118 32766 aVal = _mm256_load_ps(aPtr);
119 32766 bVal = _mm256_load_ps(bPtr);
120
121 32766 cVal = _mm256_div_ps(aVal, bVal);
122
123 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
124
125 32766 aPtr += 8;
126 32766 bPtr += 8;
127 32766 cPtr += 8;
128 }
129
130 2 number = eighthPoints * 8;
131 2/2 16 for (; number < num_points; number++) {
        ✓ Branch 0 taken 14 times.
        ✓ Branch 1 taken 2 times.
132 14 *cPtr++ = (*aPtr++) / (*bPtr++);
133 }
134 2 }
135 #endif /* LV_HAVE_AVX */
136
137
138 #ifdef LV_HAVE_SSE
139 #include <xmmintrin.h>
140
141 2 static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
142 const float* aVector,
143 const float* bVector,
144 unsigned int num_points)
145 {
146 2 unsigned int number = 0;
147 2 const unsigned int quarterPoints = num_points / 4;
148
149 2 float* cPtr = cVector;
150 2 const float* aPtr = aVector;
151 2 const float* bPtr = bVector;
152
153 __m128 aVal, bVal, cVal;
154 2/2 65536 for (; number < quarterPoints; number++) {
        ✓ Branch 0 taken 65534 times.
        ✓ Branch 1 taken 2 times.
155 65534 aVal = _mm_load_ps(aPtr);
156 65534 bVal = _mm_load_ps(bPtr);
157
158 65534 cVal = _mm_div_ps(aVal, bVal);
159
160 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
161
162 65534 aPtr += 4;
163 65534 bPtr += 4;
164 65534 cPtr += 4;
165 }
166
167 2 number = quarterPoints * 4;
168 2/2 8 for (; number < num_points; number++) {
        ✓ Branch 0 taken 6 times.
        ✓ Branch 1 taken 2 times.
169 6 *cPtr++ = (*aPtr++) / (*bPtr++);
170 }
171 2 }
172 #endif /* LV_HAVE_SSE */
173
174
175 #ifdef LV_HAVE_NEON
176 #include <arm_neon.h>
177
178 static inline void volk_32f_x2_divide_32f_neon(float* cVector,
179 const float* aVector,
180 const float* bVector,
181 unsigned int num_points)
182 {
183 float* cPtr = cVector;
184 const float* aPtr = aVector;
185 const float* bPtr = bVector;
186
187 float32x4x4_t aVal, bVal, bInv, cVal;
188
189 const unsigned int eighthPoints = num_points / 16;
190 unsigned int number = 0;
191 for (; number < eighthPoints; number++) {
192 aVal = vld4q_f32(aPtr);
193 aPtr += 16;
194 bVal = vld4q_f32(bPtr);
195 bPtr += 16;
196
197 __VOLK_PREFETCH(aPtr + 16);
198 __VOLK_PREFETCH(bPtr + 16);
199
200 bInv.val[0] = vrecpeq_f32(bVal.val[0]);
201 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
202 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
203 cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
204
205 bInv.val[1] = vrecpeq_f32(bVal.val[1]);
206 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
207 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
208 cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
209
210 bInv.val[2] = vrecpeq_f32(bVal.val[2]);
211 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
212 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
213 cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
214
215 bInv.val[3] = vrecpeq_f32(bVal.val[3]);
216 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
217 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
218 cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
219
220 vst4q_f32(cPtr, cVal);
221 cPtr += 16;
222 }
223
224 for (number = eighthPoints * 16; number < num_points; number++) {
225 *cPtr++ = (*aPtr++) / (*bPtr++);
226 }
227 }
228
229 #endif /* LV_HAVE_NEON */
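The NEON protokernel cannot rely on a hardware vector divide (ARMv7 NEON has none), so it approximates 1/b per lane: vrecpeq_f32 produces a rough reciprocal estimate and each vrecpsq_f32(r, b) returns (2 - r*b), so the following multiply performs one Newton-Raphson refinement, r_next = r * (2 - r*b); the kernel applies two such steps before multiplying by a. (Despite its name, eighthPoints here counts groups of 16 floats, since vld4q_f32 loads four quadwords per iteration.) A minimal scalar sketch of the same refinement, with a made-up starting value standing in for the vrecpeq_f32 estimate:

    #include <stdio.h>

    /* One Newton-Raphson step toward 1/b: mirrors r = r * vrecpsq_f32(r, b),
     * since vrecpsq_f32(r, b) computes (2 - r*b). */
    static float recip_step(float r, float b) { return r * (2.0f - r * b); }

    int main(void)
    {
        const float a = 21.0f, b = 7.0f;
        float r = 0.14f;          /* stand-in for the rough vrecpeq_f32 estimate */
        r = recip_step(r, b);     /* first refinement, as in the kernel */
        r = recip_step(r, b);     /* second refinement */
        printf("a/b ~= %.7f (exact %.7f)\n", a * r, a / b);
        return 0;
    }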
230
231
232 #ifdef LV_HAVE_GENERIC
233
234 2 static inline void volk_32f_x2_divide_32f_generic(float* cVector,
235 const float* aVector,
236 const float* bVector,
237 unsigned int num_points)
238 {
239 2 float* cPtr = cVector;
240 2 const float* aPtr = aVector;
241 2 const float* bPtr = bVector;
242 2 unsigned int number = 0;
243
244 2/2 262144 for (number = 0; number < num_points; number++) {
        ✓ Branch 0 taken 262142 times.
        ✓ Branch 1 taken 2 times.
245 262142 *cPtr++ = (*aPtr++) / (*bPtr++);
246 }
247 2 }
248 #endif /* LV_HAVE_GENERIC */
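Reading the counts: the numbers above are mutually consistent if one assumes two test invocations of num_points = 131071 each (262142 generic iterations / 2 loop exits); both the call count and num_points are inferred from the report, not stated in it. A small self-contained check of that reading:

    #include <assert.h>

    int main(void)
    {
        const unsigned int calls = 2u;                    /* inferred from the "taken 2 times" loop exits */
        const unsigned int num_points = 262142u / calls;  /* 131071 per call (inferred) */

        /* AVX path: 8 floats per vector iteration plus a scalar tail. */
        assert(calls * (num_points / 8) == 32766u);  /* AVX loop-body count above */
        assert(calls * (num_points % 8) == 14u);     /* AVX tail count above */

        /* SSE path: 4 floats per vector iteration plus a scalar tail. */
        assert(calls * (num_points / 4) == 65534u);  /* SSE loop-body count above */
        assert(calls * (num_points % 4) == 6u);      /* SSE tail count above */
        return 0;
    }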
249
250
251 #ifdef LV_HAVE_ORC
252
253 extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
254 const float* aVector,
255 const float* bVector,
256 unsigned int num_points);
257
258 2 static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
259 const float* aVector,
260 const float* bVector,
261 unsigned int num_points)
262 {
263 2 volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
264 2 }
265 #endif /* LV_HAVE_ORC */
266
267
268 #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
269
270
271 #ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
272 #define INCLUDED_volk_32f_x2_divide_32f_u_H
273
274 #include <inttypes.h>
275 #include <stdio.h>
276
277 #ifdef LV_HAVE_AVX512F
278 #include <immintrin.h>
279
280 static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
281 const float* aVector,
282 const float* bVector,
283 unsigned int num_points)
284 {
285 unsigned int number = 0;
286 const unsigned int sixteenthPoints = num_points / 16;
287
288 float* cPtr = cVector;
289 const float* aPtr = aVector;
290 const float* bPtr = bVector;
291
292 __m512 aVal, bVal, cVal;
293 for (; number < sixteenthPoints; number++) {
294 aVal = _mm512_loadu_ps(aPtr);
295 bVal = _mm512_loadu_ps(bPtr);
296
297 cVal = _mm512_div_ps(aVal, bVal);
298
299 _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
300
301 aPtr += 16;
302 bPtr += 16;
303 cPtr += 16;
304 }
305
306 number = sixteenthPoints * 16;
307 for (; number < num_points; number++) {
308 *cPtr++ = (*aPtr++) / (*bPtr++);
309 }
310 }
311 #endif /* LV_HAVE_AVX512F */
312
313
314 #ifdef LV_HAVE_AVX
315 #include <immintrin.h>
316
317 2 static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
318 const float* aVector,
319 const float* bVector,
320 unsigned int num_points)
321 {
322 2 unsigned int number = 0;
323 2 const unsigned int eighthPoints = num_points / 8;
324
325 2 float* cPtr = cVector;
326 2 const float* aPtr = aVector;
327 2 const float* bPtr = bVector;
328
329 __m256 aVal, bVal, cVal;
330 2/2 32768 for (; number < eighthPoints; number++) {
        ✓ Branch 0 taken 32766 times.
        ✓ Branch 1 taken 2 times.
331 32766 aVal = _mm256_loadu_ps(aPtr);
332 32766 bVal = _mm256_loadu_ps(bPtr);
333
334 32766 cVal = _mm256_div_ps(aVal, bVal);
335
336 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
337
338 32766 aPtr += 8;
339 32766 bPtr += 8;
340 32766 cPtr += 8;
341 }
342
343 2 number = eighthPoints * 8;
344 2/2 16 for (; number < num_points; number++) {
        ✓ Branch 0 taken 14 times.
        ✓ Branch 1 taken 2 times.
345 14 *cPtr++ = (*aPtr++) / (*bPtr++);
346 }
347 2 }
348 #endif /* LV_HAVE_AVX */
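The unaligned (_u_) protokernels in this second header block differ from their aligned (_a_) counterparts only in using the unaligned load/store intrinsics (_mm256_loadu_ps/_mm256_storeu_ps and the AVX-512 equivalents). A minimal usage sketch, assuming VOLK's usual runtime dispatch: the plain volk_32f_x2_divide_32f entry point checks buffer alignment itself, so buffers from plain malloc simply route to these _u_ kernels instead of the _a_ ones.

    #include <stdlib.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int n = 1000;
        /* Plain malloc gives no SIMD alignment guarantee; the dispatcher
         * will use an unaligned (_u_) protokernel for such buffers. */
        float* a = (float*)malloc(n * sizeof(float));
        float* b = (float*)malloc(n * sizeof(float));
        float* c = (float*)malloc(n * sizeof(float));
        for (unsigned int i = 0; i < n; i++) {
            a[i] = (float)(i + 1);
            b[i] = 2.0f;
        }
        volk_32f_x2_divide_32f(c, a, b, n); /* c[i] = a[i] / b[i] */
        free(a);
        free(b);
        free(c);
        return 0;
    }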
349
350 #endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
351