Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2018 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_add_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Adds two vectors together element by element: | ||
16 | * | ||
17 | * c[i] = a[i] + b[i] | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const | ||
22 | * lv_32fc_t* bVector, unsigned int num_points) \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li aVector: First vector of input points. | ||
26 | * \li bVector: Second vector of input points. | ||
27 | * \li num_points: The number of values in each input vector. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li cVector: The output vector. | ||
31 | * | ||
32 | * \b Example | ||
33 | * | ||
34 | * The following example adds the increasing and decreasing vectors such that the result of | ||
35 | * every summation pair is 10 | ||
36 | * | ||
37 | * \code | ||
38 | * int N = 10; | ||
39 | * unsigned int alignment = volk_get_alignment(); | ||
40 | * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
41 | * lv_32fc_t* decreasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
42 | * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
43 | * | ||
44 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
45 | * increasing[ii] = (lv_32fc_t)ii; | ||
46 | * decreasing[ii] = 10.f - (lv_32fc_t)ii; | ||
47 | * } | ||
48 | * | ||
49 | * volk_32fc_x2_add_32fc(out, increasing, decreasing, N); | ||
50 | * | ||
51 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
52 | * printf("out[%u] = %1.2f + %1.2fj\n", ii, lv_creal(out[ii]), lv_cimag(out[ii])); | ||
53 | * } | ||
54 | * | ||
55 | * volk_free(increasing); | ||
56 | * volk_free(decreasing); | ||
57 | * volk_free(out); | ||
58 | * \endcode | ||
59 | */ | ||
60 | |||
61 | #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H | ||
62 | #define INCLUDED_volk_32fc_x2_add_32fc_u_H | ||
63 | |||
64 | #ifdef LV_HAVE_AVX | ||
65 | #include <immintrin.h> | ||
66 | |||
67 | 2 | static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | |
68 | const lv_32fc_t* aVector, /* first input vector */ | ||
69 | const lv_32fc_t* bVector, /* second input vector */ | ||
70 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
71 | { /* AVX kernel with unaligned loads and stores: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
72 | 2 | unsigned int number = 0; | |
73 | 2 | const unsigned int quarterPoints = num_points / 4; /* number of complete 4-element groups */ | |
74 | /* Working cursors that advance through the three buffers. */ |||
75 | 2 | lv_32fc_t* cPtr = cVector; | |
76 | 2 | const lv_32fc_t* aPtr = aVector; | |
77 | 2 | const lv_32fc_t* bPtr = bVector; | |
78 | |||
79 | __m256 aVal, bVal, cVal; | ||
80 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { /* vector loop: 4 elements per pass */ |
81 | /* A 256-bit register holds 8 floats; the float* casts and the stride of 4 assume each lv_32fc_t is two packed floats. */ |||
82 | 65534 | aVal = _mm256_loadu_ps((float*)aPtr); | |
83 | 65534 | bVal = _mm256_loadu_ps((float*)bPtr); | |
84 | |||
85 | 65534 | cVal = _mm256_add_ps(aVal, bVal); /* lane-wise float add sums real and imaginary parts independently */ | |
86 | |||
87 | _mm256_storeu_ps((float*)cPtr, | ||
88 | cVal); // Write the 8 summed floats back to the output buffer (unaligned store) | ||
89 | |||
90 | 65534 | aPtr += 4; | |
91 | 65534 | bPtr += 4; | |
92 | 65534 | cPtr += 4; | |
93 | } | ||
94 | |||
95 | 2 | number = quarterPoints * 4; /* index of the first element the vector loop did not cover */ | |
96 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { /* scalar tail: at most 3 leftover elements */ |
97 | 6 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
98 | } | ||
99 | 2 | } | |
100 | #endif /* LV_HAVE_AVX */ | ||
101 | |||
102 | |||
103 | #ifdef LV_HAVE_AVX | ||
104 | #include <immintrin.h> | ||
105 | |||
106 | 2 | static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | |
107 | const lv_32fc_t* aVector, /* first input vector */ | ||
108 | const lv_32fc_t* bVector, /* second input vector */ | ||
109 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
110 | { /* AVX kernel with aligned loads and stores: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
111 | 2 | unsigned int number = 0; | |
112 | 2 | const unsigned int quarterPoints = num_points / 4; /* number of complete 4-element groups */ | |
113 | /* Working cursors that advance through the three buffers. */ |||
114 | 2 | lv_32fc_t* cPtr = cVector; | |
115 | 2 | const lv_32fc_t* aPtr = aVector; | |
116 | 2 | const lv_32fc_t* bPtr = bVector; | |
117 | |||
118 | __m256 aVal, bVal, cVal; | ||
119 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { /* vector loop: 4 elements per pass */ |
120 | /* _mm256_load_ps and _mm256_store_ps are the aligned variants; Intel documents a 32-byte alignment requirement for their addresses. */ |||
121 | 65534 | aVal = _mm256_load_ps((float*)aPtr); | |
122 | 65534 | bVal = _mm256_load_ps((float*)bPtr); | |
123 | |||
124 | 65534 | cVal = _mm256_add_ps(aVal, bVal); /* lane-wise float add sums real and imaginary parts independently */ | |
125 | |||
126 | _mm256_store_ps((float*)cPtr, | ||
127 | cVal); // Write the 8 summed floats back to the output buffer (aligned store) | ||
128 | |||
129 | 65534 | aPtr += 4; | |
130 | 65534 | bPtr += 4; | |
131 | 65534 | cPtr += 4; | |
132 | } | ||
133 | |||
134 | 2 | number = quarterPoints * 4; /* index of the first element the vector loop did not cover */ | |
135 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { /* scalar tail: at most 3 leftover elements */ |
136 | 6 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
137 | } | ||
138 | 2 | } | |
139 | #endif /* LV_HAVE_AVX */ | ||
140 | |||
141 | |||
142 | #ifdef LV_HAVE_SSE | ||
143 | #include <xmmintrin.h> | ||
144 | |||
145 | 2 | static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | |
146 | const lv_32fc_t* aVector, /* first input vector */ | ||
147 | const lv_32fc_t* bVector, /* second input vector */ | ||
148 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
149 | { /* SSE kernel with unaligned loads and stores: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
150 | 2 | unsigned int number = 0; | |
151 | 2 | const unsigned int halfPoints = num_points / 2; /* number of complete 2-element pairs */ | |
152 | /* Working cursors that advance through the three buffers. */ |||
153 | 2 | lv_32fc_t* cPtr = cVector; | |
154 | 2 | const lv_32fc_t* aPtr = aVector; | |
155 | 2 | const lv_32fc_t* bPtr = bVector; | |
156 | |||
157 | __m128 aVal, bVal, cVal; | ||
158 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { /* vector loop: 2 elements per pass */ |
159 | /* A 128-bit register holds 4 floats; the float* casts and the stride of 2 assume each lv_32fc_t is two packed floats. */ |||
160 | 131070 | aVal = _mm_loadu_ps((float*)aPtr); | |
161 | 131070 | bVal = _mm_loadu_ps((float*)bPtr); | |
162 | |||
163 | 131070 | cVal = _mm_add_ps(aVal, bVal); /* lane-wise float add sums real and imaginary parts independently */ | |
164 | |||
165 | _mm_storeu_ps((float*)cPtr, cVal); // Write the 4 summed floats back to the output buffer (unaligned store) | ||
166 | |||
167 | 131070 | aPtr += 2; | |
168 | 131070 | bPtr += 2; | |
169 | 131070 | cPtr += 2; | |
170 | } | ||
171 | |||
172 | 2 | number = halfPoints * 2; /* index of the first element the vector loop did not cover */ | |
173 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; number < num_points; number++) { /* scalar tail: runs once when num_points is odd */ |
174 | 2 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
175 | } | ||
176 | 2 | } | |
177 | #endif /* LV_HAVE_SSE */ | ||
178 | |||
179 | |||
180 | #ifdef LV_HAVE_GENERIC | ||
181 | |||
182 | 2 | static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | |
183 | const lv_32fc_t* aVector, /* first input vector */ | ||
184 | const lv_32fc_t* bVector, /* second input vector */ | ||
185 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
186 | { /* Portable kernel without intrinsics: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
187 | 2 | lv_32fc_t* cPtr = cVector; | |
188 | 2 | const lv_32fc_t* aPtr = aVector; | |
189 | 2 | const lv_32fc_t* bPtr = bVector; | |
190 | 2 | unsigned int number = 0; | |
191 | |||
192 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { /* one element per pass */ |
193 | 262142 | *cPtr++ = (*aPtr++) + (*bPtr++); /* add one pair, then advance all three cursors */ | |
194 | } | ||
195 | 2 | } | |
196 | #endif /* LV_HAVE_GENERIC */ | ||
197 | |||
198 | |||
199 | #ifdef LV_HAVE_SSE | ||
200 | #include <xmmintrin.h> | ||
201 | |||
202 | 2 | static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | |
203 | const lv_32fc_t* aVector, /* first input vector */ | ||
204 | const lv_32fc_t* bVector, /* second input vector */ | ||
205 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
206 | { /* SSE kernel with aligned loads and stores: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
207 | 2 | unsigned int number = 0; | |
208 | 2 | const unsigned int halfPoints = num_points / 2; /* number of complete 2-element pairs */ | |
209 | /* Working cursors that advance through the three buffers. */ |||
210 | 2 | lv_32fc_t* cPtr = cVector; | |
211 | 2 | const lv_32fc_t* aPtr = aVector; | |
212 | 2 | const lv_32fc_t* bPtr = bVector; | |
213 | /* _mm_load_ps and _mm_store_ps below are the aligned variants; Intel documents a 16-byte alignment requirement for their addresses. */ |||
214 | __m128 aVal, bVal, cVal; | ||
215 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { /* vector loop: 2 elements per pass */ |
216 | 131070 | aVal = _mm_load_ps((float*)aPtr); | |
217 | 131070 | bVal = _mm_load_ps((float*)bPtr); | |
218 | |||
219 | 131070 | cVal = _mm_add_ps(aVal, bVal); /* lane-wise float add sums real and imaginary parts independently */ | |
220 | |||
221 | _mm_store_ps((float*)cPtr, cVal); // Write the 4 summed floats back to the output buffer (aligned store) | ||
222 | |||
223 | 131070 | aPtr += 2; | |
224 | 131070 | bPtr += 2; | |
225 | 131070 | cPtr += 2; | |
226 | } | ||
227 | |||
228 | 2 | number = halfPoints * 2; /* index of the first element the vector loop did not cover */ | |
229 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; number < num_points; number++) { /* scalar tail: runs once when num_points is odd */ |
230 | 2 | *cPtr++ = (*aPtr++) + (*bPtr++); | |
231 | } | ||
232 | 2 | } | |
233 | #endif /* LV_HAVE_SSE */ | ||
234 | |||
235 | |||
236 | #ifdef LV_HAVE_NEON | ||
237 | #include <arm_neon.h> | ||
238 | |||
239 | static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, /* output: receives the element-wise sums */ | ||
240 | const lv_32fc_t* aVector, /* first input vector */ | ||
241 | const lv_32fc_t* bVector, /* second input vector */ | ||
242 | unsigned int num_points) /* number of lv_32fc_t elements to add */ | ||
243 | { /* NEON kernel: writes aVector[i] + bVector[i] to cVector[i] for every i below num_points. */ | ||
244 | unsigned int number = 0; | ||
245 | const unsigned int halfPoints = num_points / 2; /* number of complete 2-element pairs */ | ||
246 | /* Working cursors that advance through the three buffers. */ |||
247 | lv_32fc_t* cPtr = cVector; | ||
248 | const lv_32fc_t* aPtr = aVector; | ||
249 | const lv_32fc_t* bPtr = bVector; | ||
250 | float32x4_t aVal, bVal, cVal; | ||
251 | for (number = 0; number < halfPoints; number++) { /* vector loop: 2 elements per pass */ | ||
252 | // Load 4 floats from each input into NEON registers | ||
253 | aVal = vld1q_f32((const float32_t*)(aPtr)); | ||
254 | bVal = vld1q_f32((const float32_t*)(bPtr)); | ||
255 | __VOLK_PREFETCH(aPtr + 2); /* NOTE(review): macro defined elsewhere; presumably hints the next pass's inputs into cache - confirm */ | ||
256 | __VOLK_PREFETCH(bPtr + 2); | ||
257 | |||
258 | // lane-wise float add (real and imaginary parts summed independently) | ||
259 | cVal = vaddq_f32(aVal, bVal); | ||
260 | // Store the results back into the C container | ||
261 | vst1q_f32((float*)(cPtr), cVal); | ||
262 | |||
263 | aPtr += 2; // a q register holds 4 floats, so each vadd covers 2 lv_32fc_ts | ||
264 | bPtr += 2; | ||
265 | cPtr += 2; | ||
266 | } | ||
267 | |||
268 | number = halfPoints * 2; // first index not covered by the vector loop (equals num_points only when num_points is even) | ||
269 | for (; number < num_points; number++) { /* scalar tail: runs once when num_points is odd */ | ||
270 | *cPtr++ = (*aPtr++) + (*bPtr++); | ||
271 | } | ||
272 | } | ||
273 | |||
274 | #endif /* LV_HAVE_NEON */ | ||
275 | |||
276 | |||
277 | #endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */ | ||
278 |