GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_add_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 87 121 71.9%
Functions: 7 9 77.8%
Branches: 20 28 71.4%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_add_32f
12 *
13 * \b Overview
14 *
15 * Adds two vectors together element by element:
16 *
17 * c[i] = a[i] + b[i]
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
22 * unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li aVector: First vector of input points.
26 * \li bVector: Second vector of input points.
27 * \li num_points: The number of values in both input vectors.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 *
34 * The following example adds the increasing and decreasing vectors such that the result of
35 * every summation pair is 10
36 *
37 * \code
38 * int N = 10;
39 * unsigned int alignment = volk_get_alignment();
40 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
41 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
42 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
43 *
44 * for(unsigned int ii = 0; ii < N; ++ii){
45 * increasing[ii] = (float)ii;
46 * decreasing[ii] = 10.f - (float)ii;
47 * }
48 *
49 * volk_32f_x2_add_32f(out, increasing, decreasing, N);
50 *
51 * for(unsigned int ii = 0; ii < N; ++ii){
52 * printf("out[%u] = %1.2f\n", ii, out[ii]);
53 * }
54 *
55 * volk_free(increasing);
56 * volk_free(decreasing);
57 * volk_free(out);
58 * \endcode
59 */
60
61 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62 #define INCLUDED_volk_32f_x2_add_32f_u_H
63
64 #include <inttypes.h>
65 #include <stdio.h>
66
67 #ifdef LV_HAVE_AVX512F
68 #include <immintrin.h>
69
70 static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
71 const float* aVector,
72 const float* bVector,
73 unsigned int num_points)
74 {
75 unsigned int number = 0;
76 const unsigned int sixteenthPoints = num_points / 16;
77
78 float* cPtr = cVector;
79 const float* aPtr = aVector;
80 const float* bPtr = bVector;
81
82 __m512 aVal, bVal, cVal;
83 for (; number < sixteenthPoints; number++) {
84
85 aVal = _mm512_loadu_ps(aPtr);
86 bVal = _mm512_loadu_ps(bPtr);
87
88 cVal = _mm512_add_ps(aVal, bVal);
89
90 _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
91
92 aPtr += 16;
93 bPtr += 16;
94 cPtr += 16;
95 }
96
97 number = sixteenthPoints * 16;
98
99 for (; number < num_points; number++) {
100 *cPtr++ = (*aPtr++) + (*bPtr++);
101 }
102 }
103
104 #endif /* LV_HAVE_AVX512F */
105
106
107 #ifdef LV_HAVE_AVX
108 #include <immintrin.h>
109
110 2 static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
111 const float* aVector,
112 const float* bVector,
113 unsigned int num_points)
114 {
115 2 unsigned int number = 0;
116 2 const unsigned int eighthPoints = num_points / 8;
117 2 float* cPtr = cVector;
118 2 const float* aPtr = aVector;
119 2 const float* bPtr = bVector;
120 __m256 aVal, bVal, cVal;
121
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
122
123 32766 aVal = _mm256_loadu_ps(aPtr);
124 32766 bVal = _mm256_loadu_ps(bPtr);
125
126 32766 cVal = _mm256_add_ps(aVal, bVal);
127
128 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
129
130 32766 aPtr += 8;
131 32766 bPtr += 8;
132 32766 cPtr += 8;
133 }
134
135 2 number = eighthPoints * 8;
136
137
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
138 14 *cPtr++ = (*aPtr++) + (*bPtr++);
139 }
140 2 }
141 #endif /* LV_HAVE_AVX */
142
143
144 #ifdef LV_HAVE_SSE
145 #include <xmmintrin.h>
146
147 2 static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
148 const float* aVector,
149 const float* bVector,
150 unsigned int num_points)
151 {
152 2 unsigned int number = 0;
153 2 const unsigned int quarterPoints = num_points / 4;
154
155 2 float* cPtr = cVector;
156 2 const float* aPtr = aVector;
157 2 const float* bPtr = bVector;
158
159 __m128 aVal, bVal, cVal;
160
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
161
162 65534 aVal = _mm_loadu_ps(aPtr);
163 65534 bVal = _mm_loadu_ps(bPtr);
164
165 65534 cVal = _mm_add_ps(aVal, bVal);
166
167 _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
168
169 65534 aPtr += 4;
170 65534 bPtr += 4;
171 65534 cPtr += 4;
172 }
173
174 2 number = quarterPoints * 4;
175
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
176 6 *cPtr++ = (*aPtr++) + (*bPtr++);
177 }
178 2 }
179 #endif /* LV_HAVE_SSE */
180
181
182 #ifdef LV_HAVE_GENERIC
183
184 2 static inline void volk_32f_x2_add_32f_generic(float* cVector,
185 const float* aVector,
186 const float* bVector,
187 unsigned int num_points)
188 {
189 2 float* cPtr = cVector;
190 2 const float* aPtr = aVector;
191 2 const float* bPtr = bVector;
192 2 unsigned int number = 0;
193
194
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
195 262142 *cPtr++ = (*aPtr++) + (*bPtr++);
196 }
197 2 }
198 #endif /* LV_HAVE_GENERIC */
199
200
201 #endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
202 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203 #define INCLUDED_volk_32f_x2_add_32f_a_H
204
205 #include <inttypes.h>
206 #include <stdio.h>
207
208 #ifdef LV_HAVE_AVX512F
209 #include <immintrin.h>
210
211 static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
212 const float* aVector,
213 const float* bVector,
214 unsigned int num_points)
215 {
216 unsigned int number = 0;
217 const unsigned int sixteenthPoints = num_points / 16;
218
219 float* cPtr = cVector;
220 const float* aPtr = aVector;
221 const float* bPtr = bVector;
222
223 __m512 aVal, bVal, cVal;
224 for (; number < sixteenthPoints; number++) {
225
226 aVal = _mm512_load_ps(aPtr);
227 bVal = _mm512_load_ps(bPtr);
228
229 cVal = _mm512_add_ps(aVal, bVal);
230
231 _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
232
233 aPtr += 16;
234 bPtr += 16;
235 cPtr += 16;
236 }
237
238 number = sixteenthPoints * 16;
239
240 for (; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);
242 }
243 }
244
245 #endif /* LV_HAVE_AVX512F */
246
247
248 #ifdef LV_HAVE_AVX
249 #include <immintrin.h>
250
251 2 static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
252 const float* aVector,
253 const float* bVector,
254 unsigned int num_points)
255 {
256 2 unsigned int number = 0;
257 2 const unsigned int eighthPoints = num_points / 8;
258
259 2 float* cPtr = cVector;
260 2 const float* aPtr = aVector;
261 2 const float* bPtr = bVector;
262
263 __m256 aVal, bVal, cVal;
264
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
265
266 32766 aVal = _mm256_load_ps(aPtr);
267 32766 bVal = _mm256_load_ps(bPtr);
268
269 32766 cVal = _mm256_add_ps(aVal, bVal);
270
271 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
272
273 32766 aPtr += 8;
274 32766 bPtr += 8;
275 32766 cPtr += 8;
276 }
277
278 2 number = eighthPoints * 8;
279
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
280 14 *cPtr++ = (*aPtr++) + (*bPtr++);
281 }
282 2 }
283 #endif /* LV_HAVE_AVX */
284
285 #ifdef LV_HAVE_SSE
286 #include <xmmintrin.h>
287
288 2 static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
289 const float* aVector,
290 const float* bVector,
291 unsigned int num_points)
292 {
293 2 unsigned int number = 0;
294 2 const unsigned int quarterPoints = num_points / 4;
295
296 2 float* cPtr = cVector;
297 2 const float* aPtr = aVector;
298 2 const float* bPtr = bVector;
299
300 __m128 aVal, bVal, cVal;
301
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
302 65534 aVal = _mm_load_ps(aPtr);
303 65534 bVal = _mm_load_ps(bPtr);
304
305 65534 cVal = _mm_add_ps(aVal, bVal);
306
307 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
308
309 65534 aPtr += 4;
310 65534 bPtr += 4;
311 65534 cPtr += 4;
312 }
313
314 2 number = quarterPoints * 4;
315
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
316 6 *cPtr++ = (*aPtr++) + (*bPtr++);
317 }
318 2 }
319 #endif /* LV_HAVE_SSE */
320
321
322 #ifdef LV_HAVE_NEON
323 #include <arm_neon.h>
324
325 static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
326 const float* aVector,
327 const float* bVector,
328 unsigned int num_points)
329 {
330 unsigned int number = 0;
331 const unsigned int quarterPoints = num_points / 4;
332
333 float* cPtr = cVector;
334 const float* aPtr = aVector;
335 const float* bPtr = bVector;
336 float32x4_t aVal, bVal, cVal;
337 for (number = 0; number < quarterPoints; number++) {
338 // Load in to NEON registers
339 aVal = vld1q_f32(aPtr);
340 bVal = vld1q_f32(bPtr);
341 __VOLK_PREFETCH(aPtr + 4);
342 __VOLK_PREFETCH(bPtr + 4);
343
344 // vector add
345 cVal = vaddq_f32(aVal, bVal);
346 // Store the results back into the C container
347 vst1q_f32(cPtr, cVal);
348
349 aPtr += 4; // q uses quadwords, 4 floats per vadd
350 bPtr += 4;
351 cPtr += 4;
352 }
353
354 number = quarterPoints * 4; // index of the first element not handled by the vector loop
355 for (; number < num_points; number++) {
356 *cPtr++ = (*aPtr++) + (*bPtr++);
357 }
358 }
359
360 #endif /* LV_HAVE_NEON */
361
362 #ifdef LV_HAVE_NEONV7
363 extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
364 const float* aVector,
365 const float* bVector,
366 unsigned int num_points);
367 #endif /* LV_HAVE_NEONV7 */
368
369 #ifdef LV_HAVE_NEONV7
370 extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
371 const float* aVector,
372 const float* bVector,
373 unsigned int num_points);
374 #endif /* LV_HAVE_NEONV7 */
375
376 #ifdef LV_HAVE_GENERIC
377
378 2 static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
379 const float* aVector,
380 const float* bVector,
381 unsigned int num_points)
382 {
383 2 float* cPtr = cVector;
384 2 const float* aPtr = aVector;
385 2 const float* bPtr = bVector;
386 2 unsigned int number = 0;
387
388
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
389 262142 *cPtr++ = (*aPtr++) + (*bPtr++);
390 }
391 2 }
392 #endif /* LV_HAVE_GENERIC */
393
394
395 #ifdef LV_HAVE_ORC
396
397 extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
398 const float* aVector,
399 const float* bVector,
400 unsigned int num_points);
401
402 2 static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
403 const float* aVector,
404 const float* bVector,
405 unsigned int num_points)
406 {
407 2 volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
408 2 }
409
410 #endif /* LV_HAVE_ORC */
411
412
413 #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
414