Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_x2_add_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Adds two vectors together element by element: | ||
16 | * | ||
17 | * c[i] = a[i] + b[i] | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, | ||
22 | * unsigned int num_points) \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li aVector: First vector of input points. | ||
26 | * \li bVector: Second vector of input points. | ||
27 | * \li num_points: The number of values in each input vector. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li cVector: The output vector. | ||
31 | * | ||
32 | * \b Example | ||
33 | * | ||
34 | * The following example adds the increasing and decreasing vectors such that the result of | ||
35 | * every summation pair is 10 | ||
36 | * | ||
37 | * \code | ||
38 | * int N = 10; | ||
39 | * unsigned int alignment = volk_get_alignment(); | ||
40 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
41 | * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
42 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
43 | * | ||
44 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
45 | * increasing[ii] = (float)ii; | ||
46 | * decreasing[ii] = 10.f - (float)ii; | ||
47 | * } | ||
48 | * | ||
49 | * volk_32f_x2_add_32f(out, increasing, decreasing, N); | ||
50 | * | ||
51 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
52 | * printf("out[%u] = %1.2f\n", ii, out[ii]); | ||
53 | * } | ||
54 | * | ||
55 | * volk_free(increasing); | ||
56 | * volk_free(decreasing); | ||
57 | * volk_free(out); | ||
58 | * \endcode | ||
59 | */ | ||
60 | |||
61 | #ifndef INCLUDED_volk_32f_x2_add_32f_u_H | ||
62 | #define INCLUDED_volk_32f_x2_add_32f_u_H | ||
63 | |||
64 | #include <inttypes.h> | ||
65 | #include <stdio.h> | ||
66 | |||
67 | #ifdef LV_HAVE_AVX512F | ||
68 | #include <immintrin.h> | ||
69 | |||
70 | ✗ | static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector, | |
71 | const float* aVector, | ||
72 | const float* bVector, | ||
73 | unsigned int num_points) | ||
74 | { | ||
75 | ✗ | unsigned int number = 0; | |
76 | ✗ | const unsigned int sixteenthPoints = num_points / 16; | |
77 | |||
78 | ✗ | float* cPtr = cVector; | |
79 | ✗ | const float* aPtr = aVector; | |
80 | ✗ | const float* bPtr = bVector; | |
81 | |||
82 | __m512 aVal, bVal, cVal; | ||
83 | ✗ | for (; number < sixteenthPoints; number++) { | |
84 | |||
85 | ✗ | aVal = _mm512_loadu_ps(aPtr); | |
86 | ✗ | bVal = _mm512_loadu_ps(bPtr); | |
87 | |||
88 | ✗ | cVal = _mm512_add_ps(aVal, bVal); | |
89 | |||
90 | _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container | ||
91 | |||
92 | ✗ | aPtr += 16; | |
93 | ✗ | bPtr += 16; | |
94 | ✗ | cPtr += 16; | |
95 | } | ||
96 | |||
97 | ✗ | number = sixteenthPoints * 16; | |
98 | |||
99 | ✗ | for (; number < num_points; number++) { | |
100 | ✗ | *cPtr++ = (*aPtr++) + (*bPtr++); | |
101 | } | ||
102 | ✗ | } | |
103 | |||
104 | #endif /* LV_HAVE_AVX512F */ | ||
105 | |||
106 | |||
107 | #ifdef LV_HAVE_AVX | ||
108 | #include <immintrin.h> | ||
109 | |||
110 | 2 | static inline void volk_32f_x2_add_32f_u_avx(float* cVector, | |
111 | const float* aVector, | ||
112 | const float* bVector, | ||
113 | unsigned int num_points) | ||
114 | { | ||
115 | 2 | unsigned int number = 0; | |
116 | 2 | const unsigned int eighthPoints = num_points / 8; | |
117 | 2 | float* cPtr = cVector; | |
118 | 2 | const float* aPtr = aVector; | |
119 | 2 | const float* bPtr = bVector; | |
120 | __m256 aVal, bVal, cVal; | ||
121 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
122 | |||
123 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
124 | 32766 | bVal = _mm256_loadu_ps(bPtr); | |
125 | |||
126 | 32766 | cVal = _mm256_add_ps(aVal, bVal); | |
127 | |||
128 | _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container | ||
129 | |||
130 | 32766 | aPtr += 8; | |
131 | 32766 | bPtr += 8; | |
132 | 32766 | cPtr += 8; | |
133 | } | ||
134 | |||
135 | 2 | number = eighthPoints * 8; | |
136 | |||
137 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
138 | 14 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
139 | } | ||
140 | 2 | } | |
141 | #endif /* LV_HAVE_AVX */ | ||
142 | |||
143 | |||
144 | #ifdef LV_HAVE_SSE | ||
145 | #include <xmmintrin.h> | ||
146 | |||
/*!
 * \brief Element-wise sum of two float vectors using unaligned SSE accesses.
 *
 * Handles 4 floats per iteration with unaligned loads/stores; the remaining
 * (num_points % 4) elements are added one at a time in a scalar loop.
 *
 * \param cVector    Output buffer; cVector[i] receives aVector[i] + bVector[i].
 * \param aVector    First addend.
 * \param bVector    Second addend.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int simdEnd = (num_points / 4) * 4;
    unsigned int idx = 0;

    for (; idx < simdEnd; idx += 4) {
        const __m128 lhs = _mm_loadu_ps(aVector + idx);
        const __m128 rhs = _mm_loadu_ps(bVector + idx);

        _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    /* Scalar tail: idx == simdEnd here. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
179 | #endif /* LV_HAVE_SSE */ | ||
180 | |||
181 | |||
182 | #ifdef LV_HAVE_GENERIC | ||
183 | |||
/*!
 * \brief Portable scalar element-wise sum of two float vectors.
 *
 * \param cVector    Output buffer; cVector[i] receives aVector[i] + bVector[i].
 * \param aVector    First addend.
 * \param bVector    Second addend.
 * \param num_points Number of elements to process; nothing is written when 0.
 */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
198 | #endif /* LV_HAVE_GENERIC */ | ||
199 | |||
200 | |||
201 | #endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ | ||
202 | #ifndef INCLUDED_volk_32f_x2_add_32f_a_H | ||
203 | #define INCLUDED_volk_32f_x2_add_32f_a_H | ||
204 | |||
205 | #include <inttypes.h> | ||
206 | #include <stdio.h> | ||
207 | |||
208 | #ifdef LV_HAVE_AVX512F | ||
209 | #include <immintrin.h> | ||
210 | |||
211 | ✗ | static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector, | |
212 | const float* aVector, | ||
213 | const float* bVector, | ||
214 | unsigned int num_points) | ||
215 | { | ||
216 | ✗ | unsigned int number = 0; | |
217 | ✗ | const unsigned int sixteenthPoints = num_points / 16; | |
218 | |||
219 | ✗ | float* cPtr = cVector; | |
220 | ✗ | const float* aPtr = aVector; | |
221 | ✗ | const float* bPtr = bVector; | |
222 | |||
223 | __m512 aVal, bVal, cVal; | ||
224 | ✗ | for (; number < sixteenthPoints; number++) { | |
225 | |||
226 | ✗ | aVal = _mm512_load_ps(aPtr); | |
227 | ✗ | bVal = _mm512_load_ps(bPtr); | |
228 | |||
229 | ✗ | cVal = _mm512_add_ps(aVal, bVal); | |
230 | |||
231 | _mm512_store_ps(cPtr, cVal); // Store the results back into the C container | ||
232 | |||
233 | ✗ | aPtr += 16; | |
234 | ✗ | bPtr += 16; | |
235 | ✗ | cPtr += 16; | |
236 | } | ||
237 | |||
238 | ✗ | number = sixteenthPoints * 16; | |
239 | |||
240 | ✗ | for (; number < num_points; number++) { | |
241 | ✗ | *cPtr++ = (*aPtr++) + (*bPtr++); | |
242 | } | ||
243 | ✗ | } | |
244 | |||
245 | #endif /* LV_HAVE_AVX512F */ | ||
246 | |||
247 | |||
248 | #ifdef LV_HAVE_AVX | ||
249 | #include <immintrin.h> | ||
250 | |||
251 | 2 | static inline void volk_32f_x2_add_32f_a_avx(float* cVector, | |
252 | const float* aVector, | ||
253 | const float* bVector, | ||
254 | unsigned int num_points) | ||
255 | { | ||
256 | 2 | unsigned int number = 0; | |
257 | 2 | const unsigned int eighthPoints = num_points / 8; | |
258 | |||
259 | 2 | float* cPtr = cVector; | |
260 | 2 | const float* aPtr = aVector; | |
261 | 2 | const float* bPtr = bVector; | |
262 | |||
263 | __m256 aVal, bVal, cVal; | ||
264 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
265 | |||
266 | 32766 | aVal = _mm256_load_ps(aPtr); | |
267 | 32766 | bVal = _mm256_load_ps(bPtr); | |
268 | |||
269 | 32766 | cVal = _mm256_add_ps(aVal, bVal); | |
270 | |||
271 | _mm256_store_ps(cPtr, cVal); // Store the results back into the C container | ||
272 | |||
273 | 32766 | aPtr += 8; | |
274 | 32766 | bPtr += 8; | |
275 | 32766 | cPtr += 8; | |
276 | } | ||
277 | |||
278 | 2 | number = eighthPoints * 8; | |
279 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
280 | 14 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
281 | } | ||
282 | 2 | } | |
283 | #endif /* LV_HAVE_AVX */ | ||
284 | |||
285 | #ifdef LV_HAVE_SSE | ||
286 | #include <xmmintrin.h> | ||
287 | |||
/*!
 * \brief Element-wise sum of two float vectors using aligned SSE accesses.
 *
 * Handles 4 floats per iteration with the aligned load/store intrinsics
 * (_mm_load_ps / _mm_store_ps), so all three buffers are expected to be
 * 16-byte aligned. The remaining (num_points % 4) elements are added one at a
 * time in a scalar loop.
 *
 * \param cVector    Output buffer; cVector[i] receives aVector[i] + bVector[i].
 * \param aVector    First addend.
 * \param bVector    Second addend.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int simdEnd = (num_points / 4) * 4;
    unsigned int idx = 0;

    for (; idx < simdEnd; idx += 4) {
        const __m128 lhs = _mm_load_ps(aVector + idx);
        const __m128 rhs = _mm_load_ps(bVector + idx);

        _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    /* Scalar tail: idx == simdEnd here. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
319 | #endif /* LV_HAVE_SSE */ | ||
320 | |||
321 | |||
322 | #ifdef LV_HAVE_NEON | ||
323 | #include <arm_neon.h> | ||
324 | |||
/*!
 * \brief Element-wise sum of two float vectors using NEON intrinsics.
 *
 * Handles 4 floats per iteration (vld1q_f32 / vaddq_f32 / vst1q_f32); the
 * remaining (num_points % 4) elements are added one at a time in a scalar loop.
 *
 * \param cVector    Output buffer; cVector[i] receives aVector[i] + bVector[i].
 * \param aVector    First addend.
 * \param bVector    Second addend.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    const float* lhs = aVector;
    const float* rhs = bVector;
    float* out = cVector;
    unsigned int blocksLeft = num_points / 4;
    unsigned int tailLeft = num_points % 4;

    for (; blocksLeft > 0; --blocksLeft) {
        const float32x4_t lhsVec = vld1q_f32(lhs);
        const float32x4_t rhsVec = vld1q_f32(rhs);

        /* __VOLK_PREFETCH is defined outside this file; presumably a cache
         * prefetch hint for the next 4-float chunk -- confirm against its
         * definition. */
        __VOLK_PREFETCH(lhs + 4);
        __VOLK_PREFETCH(rhs + 4);

        vst1q_f32(out, vaddq_f32(lhsVec, rhsVec));

        lhs += 4;
        rhs += 4;
        out += 4;
    }

    /* Scalar tail for the elements that do not fill a 4-float vector. */
    for (; tailLeft > 0; --tailLeft) {
        *out++ = (*lhs++) + (*rhs++);
    }
}
359 | |||
360 | #endif /* LV_HAVE_NEON */ | ||
361 | |||
#ifdef LV_HAVE_NEONV7
/* Declarations only: both kernels are defined outside this header. */
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
                                          const float* aVector,
                                          const float* bVector,
                                          unsigned int num_points);

extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
375 | |||
376 | #ifdef LV_HAVE_GENERIC | ||
377 | |||
/*!
 * \brief Portable scalar element-wise sum of two float vectors.
 *
 * \param cVector    Output buffer; cVector[i] receives aVector[i] + bVector[i].
 * \param aVector    First addend.
 * \param bVector    Second addend.
 * \param num_points Number of elements to process; nothing is written when 0.
 */
static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
392 | #endif /* LV_HAVE_GENERIC */ | ||
393 | |||
394 | |||
395 | #ifdef LV_HAVE_ORC | ||
396 | |||
/* Declaration only: the implementation is not defined in the visible part of
 * this header. */
extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards every argument, unchanged and in the same
 *        order, to volk_32f_x2_add_32f_a_orc_impl().
 *
 * \param cVector    Output buffer passed through to the implementation.
 * \param aVector    First input vector passed through to the implementation.
 * \param bVector    Second input vector passed through to the implementation.
 * \param num_points Element count passed through to the implementation.
 */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
409 | |||
410 | #endif /* LV_HAVE_ORC */ | ||
411 | |||
412 | |||
413 | #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */ | ||
414 |