Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2019 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_accumulator_s32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Accumulates the values in the input buffer. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32fc_accumulator_s32fc(lv_32fc_t* result, const lv_32fc_t* inputBuffer, | ||
20 | * unsigned int num_points) \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li inputBuffer: The buffer of data to be accumulated | ||
24 | * \li num_points: The number of data points. | ||
25 | * | ||
26 | * \b Outputs | ||
27 | * \li result: The accumulated result. | ||
28 | * | ||
29 | * \b Example | ||
30 | * Calculate the sum of numbers 0 through 99 | ||
31 | * \code | ||
32 | * int N = 100; | ||
33 | * unsigned int alignment = volk_get_alignment(); | ||
34 | * lv_32fc_t* vec = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
35 | * lv_32fc_t* out = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t), alignment); | ||
36 | * | ||
37 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
38 | * vec[ii] = lv_cmake( (float) ii, -(float) ii ); | ||
39 | * } | ||
40 | * | ||
41 | * volk_32fc_accumulator_s32fc(out, vec, N); | ||
42 | * | ||
43 | * printf("sum(0..99)+1j*sum(0..-99) = %1.2f %1.2f \n", lv_creal(*out) , lv_cimag(*out) | ||
44 | * ); | ||
45 | * | ||
46 | * volk_free(vec); | ||
47 | * volk_free(out); | ||
48 | * \endcode | ||
49 | */ | ||
50 | |||
51 | #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H | ||
52 | #define INCLUDED_volk_32fc_accumulator_s32fc_a_H | ||
53 | |||
54 | #include <inttypes.h> | ||
55 | #include <volk/volk_common.h> | ||
56 | |||
57 | #ifdef LV_HAVE_GENERIC | ||
58 | 2 | static inline void volk_32fc_accumulator_s32fc_generic(lv_32fc_t* result, | |
59 | const lv_32fc_t* inputBuffer, | ||
60 | unsigned int num_points) | ||
61 | { | ||
62 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
63 | 2 | unsigned int number = 0; | |
64 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
65 | |||
66 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
67 | 262142 | returnValue += (*aPtr++); | |
68 | } | ||
69 | 2 | *result = returnValue; | |
70 | 2 | } | |
71 | #endif /* LV_HAVE_GENERIC */ | ||
72 | |||
73 | #ifdef LV_HAVE_AVX | ||
74 | #include <immintrin.h> | ||
75 | |||
76 | 2 | static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result, | |
77 | const lv_32fc_t* inputBuffer, | ||
78 | unsigned int num_points) | ||
79 | { | ||
80 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
81 | 2 | unsigned int number = 0; | |
82 | 2 | const unsigned int quarterPoints = num_points / 4; | |
83 | |||
84 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
85 | __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; | ||
86 | |||
87 | 2 | __m256 accumulator = _mm256_setzero_ps(); | |
88 | 2 | __m256 aVal = _mm256_setzero_ps(); | |
89 | |||
90 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
91 | 65534 | aVal = _mm256_loadu_ps((float*)aPtr); | |
92 | 65534 | accumulator = _mm256_add_ps(accumulator, aVal); | |
93 | 65534 | aPtr += 4; | |
94 | } | ||
95 | |||
96 | _mm256_store_ps(tempBuffer, accumulator); | ||
97 | |||
98 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
99 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
100 | 2 | returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]); | |
101 | 2 | returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]); | |
102 | |||
103 | 2 | number = quarterPoints * 4; | |
104 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
105 | 6 | returnValue += (*aPtr++); | |
106 | } | ||
107 | 2 | *result = returnValue; | |
108 | 2 | } | |
109 | #endif /* LV_HAVE_AVX */ | ||
110 | |||
111 | #ifdef LV_HAVE_SSE | ||
112 | #include <xmmintrin.h> | ||
113 | |||
114 | 2 | static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result, | |
115 | const lv_32fc_t* inputBuffer, | ||
116 | unsigned int num_points) | ||
117 | { | ||
118 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
119 | 2 | unsigned int number = 0; | |
120 | 2 | const unsigned int halfPoints = num_points / 2; | |
121 | |||
122 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
123 | __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; | ||
124 | |||
125 | 2 | __m128 accumulator = _mm_setzero_ps(); | |
126 | 2 | __m128 aVal = _mm_setzero_ps(); | |
127 | |||
128 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { |
129 | 131070 | aVal = _mm_loadu_ps((float*)aPtr); | |
130 | 131070 | accumulator = _mm_add_ps(accumulator, aVal); | |
131 | 131070 | aPtr += 2; | |
132 | } | ||
133 | |||
134 | _mm_store_ps(tempBuffer, accumulator); | ||
135 | |||
136 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
137 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
138 | |||
139 | 2 | number = halfPoints * 2; | |
140 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; number < num_points; number++) { |
141 | 2 | returnValue += (*aPtr++); | |
142 | } | ||
143 | 2 | *result = returnValue; | |
144 | 2 | } | |
145 | #endif /* LV_HAVE_SSE */ | ||
146 | |||
147 | #ifdef LV_HAVE_AVX | ||
148 | #include <immintrin.h> | ||
149 | |||
150 | 2 | static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result, | |
151 | const lv_32fc_t* inputBuffer, | ||
152 | unsigned int num_points) | ||
153 | { | ||
154 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
155 | 2 | unsigned int number = 0; | |
156 | 2 | const unsigned int quarterPoints = num_points / 4; | |
157 | |||
158 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
159 | __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; | ||
160 | |||
161 | 2 | __m256 accumulator = _mm256_setzero_ps(); | |
162 | 2 | __m256 aVal = _mm256_setzero_ps(); | |
163 | |||
164 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
165 | 65534 | aVal = _mm256_load_ps((float*)aPtr); | |
166 | 65534 | accumulator = _mm256_add_ps(accumulator, aVal); | |
167 | 65534 | aPtr += 4; | |
168 | } | ||
169 | |||
170 | _mm256_store_ps(tempBuffer, accumulator); | ||
171 | |||
172 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
173 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
174 | 2 | returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]); | |
175 | 2 | returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]); | |
176 | |||
177 | 2 | number = quarterPoints * 4; | |
178 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
179 | 6 | returnValue += (*aPtr++); | |
180 | } | ||
181 | 2 | *result = returnValue; | |
182 | 2 | } | |
183 | #endif /* LV_HAVE_AVX */ | ||
184 | |||
185 | #ifdef LV_HAVE_SSE | ||
186 | #include <xmmintrin.h> | ||
187 | |||
188 | 2 | static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result, | |
189 | const lv_32fc_t* inputBuffer, | ||
190 | unsigned int num_points) | ||
191 | { | ||
192 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
193 | 2 | unsigned int number = 0; | |
194 | 2 | const unsigned int halfPoints = num_points / 2; | |
195 | |||
196 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
197 | __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; | ||
198 | |||
199 | 2 | __m128 accumulator = _mm_setzero_ps(); | |
200 | 2 | __m128 aVal = _mm_setzero_ps(); | |
201 | |||
202 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { |
203 | 131070 | aVal = _mm_load_ps((float*)aPtr); | |
204 | 131070 | accumulator = _mm_add_ps(accumulator, aVal); | |
205 | 131070 | aPtr += 2; | |
206 | } | ||
207 | |||
208 | _mm_store_ps(tempBuffer, accumulator); | ||
209 | |||
210 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
211 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
212 | |||
213 | 2 | number = halfPoints * 2; | |
214 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (; number < num_points; number++) { |
215 | 2 | returnValue += (*aPtr++); | |
216 | } | ||
217 | 2 | *result = returnValue; | |
218 | 2 | } | |
219 | #endif /* LV_HAVE_SSE */ | ||
220 | |||
221 | #ifdef LV_HAVE_NEON | ||
222 | #include <arm_neon.h> | ||
/*!
 * NEON implementation.
 *
 * Each iteration consumes eight complex samples as four 128-bit loads, each
 * load feeding its own running sum so the four additions are independent.
 * Afterwards the sixteen lanes are folded pairwise (real, imag) in
 * accumulator order, and the at most seven leftover samples are added one by
 * one.
 *
 * \param result      Receives the accumulated sum.
 * \param inputBuffer Samples to accumulate; read with vld1q_f32.
 * \param num_points  Number of complex samples in inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const unsigned int blockPoints = blockCount * 8;
    const float* src = (const float*)inputBuffer;
    float32x4_t sumA = vdupq_n_f32(0.f);
    float32x4_t sumB = vdupq_n_f32(0.f);
    float32x4_t sumC = vdupq_n_f32(0.f);
    float32x4_t sumD = vdupq_n_f32(0.f);
    __VOLK_ATTR_ALIGNED(32) float lanes[16];
    lv_32fc_t total;
    unsigned int k;

    /* Vector part: 4 loads x 2 complex samples = 8 samples per iteration. */
    for (k = 0; k < blockCount; ++k, src += 16) {
        sumA = vaddq_f32(vld1q_f32(src), sumA);
        sumB = vaddq_f32(vld1q_f32(src + 4), sumB);
        sumC = vaddq_f32(vld1q_f32(src + 8), sumC);
        sumD = vaddq_f32(vld1q_f32(src + 12), sumD);
    }

    /* Spill all four accumulators back to back, A first and D last. */
    vst1q_f32(lanes, sumA);
    vst1q_f32(lanes + 4, sumB);
    vst1q_f32(lanes + 8, sumC);
    vst1q_f32(lanes + 12, sumD);

    /* Even lanes hold real parts, odd lanes imaginary parts. */
    total = lv_cmake(lanes[0], lanes[1]);
    for (k = 2; k < 16; k += 2) {
        total += lv_cmake(lanes[k], lanes[k + 1]);
    }

    /* Scalar tail for the samples past the last full block of eight. */
    for (k = blockPoints; k < num_points; ++k) {
        total += inputBuffer[k];
    }

    *result = total;
}
277 | #endif /* LV_HAVE_NEON */ | ||
278 | |||
279 | #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */ | ||
280 |