GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_max_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 70 108 64.8%
Functions: 5 7 71.4%
Branches: 22 34 64.7%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_max_32f
12 *
13 * \b Overview
14 *
15 * Selects the element-wise maximum of aVector and bVector
16 * and stores the results in cVector.
17 *
18 * c[i] = max(a[i], b[i])
19 *
20 * <b>Dispatcher Prototype</b>
21 * \code
22 * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector,
23 * unsigned int num_points) \endcode
24 *
25 * \b Inputs
26 * \li aVector: First input vector.
27 * \li bVector: Second input vector.
28 * \li num_points: The number of values in each input vector.
29 *
30 * \b Outputs
31 * \li cVector: The output vector.
32 *
33 * \b Example
34 * \code
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40 *
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * increasing[ii] = (float)ii;
43 * decreasing[ii] = 10.f - (float)ii;
44 * }
45 *
46 * volk_32f_x2_max_32f(out, increasing, decreasing, N);
47 *
48 * for(unsigned int ii = 0; ii < N; ++ii){
49 * printf("out[%u] = %1.2f\n", ii, out[ii]);
50 * }
51 *
52 * volk_free(increasing);
53 * volk_free(decreasing);
54 * volk_free(out);
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_x2_max_32f_a_H
59 #define INCLUDED_volk_32f_x2_max_32f_a_H
60
61 #include <inttypes.h>
62 #include <stdio.h>
63
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
66
67 static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
68 const float* aVector,
69 const float* bVector,
70 unsigned int num_points)
71 {
72 unsigned int number = 0;
73 const unsigned int sixteenthPoints = num_points / 16;
74
75 float* cPtr = cVector;
76 const float* aPtr = aVector;
77 const float* bPtr = bVector;
78
79 __m512 aVal, bVal, cVal;
80 for (; number < sixteenthPoints; number++) {
81 aVal = _mm512_load_ps(aPtr);
82 bVal = _mm512_load_ps(bPtr);
83
84 cVal = _mm512_max_ps(aVal, bVal);
85
86 _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
87
88 aPtr += 16;
89 bPtr += 16;
90 cPtr += 16;
91 }
92
93 number = sixteenthPoints * 16;
94 for (; number < num_points; number++) {
95 const float a = *aPtr++;
96 const float b = *bPtr++;
97 *cPtr++ = (a > b ? a : b);
98 }
99 }
100 #endif /* LV_HAVE_AVX512F */
101
102 #ifdef LV_HAVE_SSE
103 #include <xmmintrin.h>
104
105 2 static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
106 const float* aVector,
107 const float* bVector,
108 unsigned int num_points)
109 {
110 2 unsigned int number = 0;
111 2 const unsigned int quarterPoints = num_points / 4;
112
113 2 float* cPtr = cVector;
114 2 const float* aPtr = aVector;
115 2 const float* bPtr = bVector;
116
117 __m128 aVal, bVal, cVal;
118
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
119 65534 aVal = _mm_load_ps(aPtr);
120 65534 bVal = _mm_load_ps(bPtr);
121
122 65534 cVal = _mm_max_ps(aVal, bVal);
123
124 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
125
126 65534 aPtr += 4;
127 65534 bPtr += 4;
128 65534 cPtr += 4;
129 }
130
131 2 number = quarterPoints * 4;
132
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
133 6 const float a = *aPtr++;
134 6 const float b = *bPtr++;
135
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
6 *cPtr++ = (a > b ? a : b);
136 }
137 2 }
138 #endif /* LV_HAVE_SSE */
139
140 #ifdef LV_HAVE_AVX
141 #include <immintrin.h>
142
143 2 static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
144 const float* aVector,
145 const float* bVector,
146 unsigned int num_points)
147 {
148 2 unsigned int number = 0;
149 2 const unsigned int eighthPoints = num_points / 8;
150
151 2 float* cPtr = cVector;
152 2 const float* aPtr = aVector;
153 2 const float* bPtr = bVector;
154
155 __m256 aVal, bVal, cVal;
156
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
157 32766 aVal = _mm256_load_ps(aPtr);
158 32766 bVal = _mm256_load_ps(bPtr);
159
160 32766 cVal = _mm256_max_ps(aVal, bVal);
161
162 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
163
164 32766 aPtr += 8;
165 32766 bPtr += 8;
166 32766 cPtr += 8;
167 }
168
169 2 number = eighthPoints * 8;
170
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
171 14 const float a = *aPtr++;
172 14 const float b = *bPtr++;
173
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 5 times.
14 *cPtr++ = (a > b ? a : b);
174 }
175 2 }
176 #endif /* LV_HAVE_AVX */
177
178 #ifdef LV_HAVE_NEON
179 #include <arm_neon.h>
180
181 static inline void volk_32f_x2_max_32f_neon(float* cVector,
182 const float* aVector,
183 const float* bVector,
184 unsigned int num_points)
185 {
186 unsigned int quarter_points = num_points / 4;
187 float* cPtr = cVector;
188 const float* aPtr = aVector;
189 const float* bPtr = bVector;
190 unsigned int number = 0;
191
192 float32x4_t a_vec, b_vec, c_vec;
193 for (number = 0; number < quarter_points; number++) {
194 a_vec = vld1q_f32(aPtr);
195 b_vec = vld1q_f32(bPtr);
196 c_vec = vmaxq_f32(a_vec, b_vec);
197 vst1q_f32(cPtr, c_vec);
198 aPtr += 4;
199 bPtr += 4;
200 cPtr += 4;
201 }
202
203 for (number = quarter_points * 4; number < num_points; number++) {
204 const float a = *aPtr++;
205 const float b = *bPtr++;
206 *cPtr++ = (a > b ? a : b);
207 }
208 }
209 #endif /* LV_HAVE_NEON */
210
211
212 #ifdef LV_HAVE_GENERIC
213
214 2 static inline void volk_32f_x2_max_32f_generic(float* cVector,
215 const float* aVector,
216 const float* bVector,
217 unsigned int num_points)
218 {
219 2 float* cPtr = cVector;
220 2 const float* aPtr = aVector;
221 2 const float* bPtr = bVector;
222 2 unsigned int number = 0;
223
224
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
225 262142 const float a = *aPtr++;
226 262142 const float b = *bPtr++;
227
2/2
✓ Branch 0 taken 131213 times.
✓ Branch 1 taken 130929 times.
262142 *cPtr++ = (a > b ? a : b);
228 }
229 2 }
230 #endif /* LV_HAVE_GENERIC */
231
232 #ifdef LV_HAVE_ORC
233 extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
234 const float* aVector,
235 const float* bVector,
236 unsigned int num_points);
237
238 2 static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
239 const float* aVector,
240 const float* bVector,
241 unsigned int num_points)
242 {
243 2 volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
244 2 }
245 #endif /* LV_HAVE_ORC */
246
247
248 #endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
249
250
251 #ifndef INCLUDED_volk_32f_x2_max_32f_u_H
252 #define INCLUDED_volk_32f_x2_max_32f_u_H
253
254 #include <inttypes.h>
255 #include <stdio.h>
256
257 #ifdef LV_HAVE_AVX512F
258 #include <immintrin.h>
259
260 static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
261 const float* aVector,
262 const float* bVector,
263 unsigned int num_points)
264 {
265 unsigned int number = 0;
266 const unsigned int sixteenthPoints = num_points / 16;
267
268 float* cPtr = cVector;
269 const float* aPtr = aVector;
270 const float* bPtr = bVector;
271
272 __m512 aVal, bVal, cVal;
273 for (; number < sixteenthPoints; number++) {
274 aVal = _mm512_loadu_ps(aPtr);
275 bVal = _mm512_loadu_ps(bPtr);
276
277 cVal = _mm512_max_ps(aVal, bVal);
278
279 _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
280
281 aPtr += 16;
282 bPtr += 16;
283 cPtr += 16;
284 }
285
286 number = sixteenthPoints * 16;
287 for (; number < num_points; number++) {
288 const float a = *aPtr++;
289 const float b = *bPtr++;
290 *cPtr++ = (a > b ? a : b);
291 }
292 }
293 #endif /* LV_HAVE_AVX512F */
294
295 #ifdef LV_HAVE_AVX
296 #include <immintrin.h>
297
298 2 static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
299 const float* aVector,
300 const float* bVector,
301 unsigned int num_points)
302 {
303 2 unsigned int number = 0;
304 2 const unsigned int eighthPoints = num_points / 8;
305
306 2 float* cPtr = cVector;
307 2 const float* aPtr = aVector;
308 2 const float* bPtr = bVector;
309
310 __m256 aVal, bVal, cVal;
311
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
312 32766 aVal = _mm256_loadu_ps(aPtr);
313 32766 bVal = _mm256_loadu_ps(bPtr);
314
315 32766 cVal = _mm256_max_ps(aVal, bVal);
316
317 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
318
319 32766 aPtr += 8;
320 32766 bPtr += 8;
321 32766 cPtr += 8;
322 }
323
324 2 number = eighthPoints * 8;
325
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
326 14 const float a = *aPtr++;
327 14 const float b = *bPtr++;
328
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 5 times.
14 *cPtr++ = (a > b ? a : b);
329 }
330 2 }
331 #endif /* LV_HAVE_AVX */
332
333 #endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
334