GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_accumulator_s32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 84 84 100.0%
Functions: 5 5 100.0%
Branches: 18 18 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_accumulator_s32fc
12 *
13 * \b Overview
14 *
15 * Accumulates the values in the input buffer.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32fc_accumulator_s32fc(lv_32fc_t* result, const lv_32fc_t* inputBuffer,
20 * unsigned int num_points) \endcode
21 *
22 * \b Inputs
23 * \li inputBuffer: The buffer of data to be accumulated
24 * \li num_points: The number of data points.
25 *
26 * \b Outputs
27 * \li result: The accumulated result.
28 *
29 * \b Example
30 * Calculate the sum of numbers 0 through 99
31 * \code
32 * int N = 100;
33 * unsigned int alignment = volk_get_alignment();
34 * lv_32fc_t* vec = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t)*N, alignment);
35 * lv_32fc_t* out = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t), alignment);
36 *
37 * for(unsigned int ii = 0; ii < N; ++ii){
38 * vec[ii] = lv_cmake( (float) ii, -(float) ii );
39 * }
40 *
41 * volk_32fc_accumulator_s32fc(out, vec, N);
42 *
43 * printf("sum(0..99)+1j*sum(0..-99) = %1.2f %1.2f \n", lv_creal(*out) , lv_cimag(*out)
44 * );
45 *
46 * volk_free(vec);
47 * volk_free(out);
48 * \endcode
49 */
50
51 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53
54 #include <inttypes.h>
55 #include <volk/volk_common.h>
56
57 #ifdef LV_HAVE_GENERIC
58 2 static inline void volk_32fc_accumulator_s32fc_generic(lv_32fc_t* result,
59 const lv_32fc_t* inputBuffer,
60 unsigned int num_points)
61 {
62 2 const lv_32fc_t* aPtr = inputBuffer;
63 2 unsigned int number = 0;
64 2 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
65
66
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (; number < num_points; number++) {
67 262142 returnValue += (*aPtr++);
68 }
69 2 *result = returnValue;
70 2 }
71 #endif /* LV_HAVE_GENERIC */
72
73 #ifdef LV_HAVE_AVX
74 #include <immintrin.h>
75
76 2 static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
77 const lv_32fc_t* inputBuffer,
78 unsigned int num_points)
79 {
80 2 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
81 2 unsigned int number = 0;
82 2 const unsigned int quarterPoints = num_points / 4;
83
84 2 const lv_32fc_t* aPtr = inputBuffer;
85 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
86
87 2 __m256 accumulator = _mm256_setzero_ps();
88 2 __m256 aVal = _mm256_setzero_ps();
89
90
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
91 65534 aVal = _mm256_loadu_ps((float*)aPtr);
92 65534 accumulator = _mm256_add_ps(accumulator, aVal);
93 65534 aPtr += 4;
94 }
95
96 _mm256_store_ps(tempBuffer, accumulator);
97
98 2 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
99 2 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
100 2 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
101 2 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
102
103 2 number = quarterPoints * 4;
104
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
105 6 returnValue += (*aPtr++);
106 }
107 2 *result = returnValue;
108 2 }
109 #endif /* LV_HAVE_AVX */
110
111 #ifdef LV_HAVE_SSE
112 #include <xmmintrin.h>
113
114 2 static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
115 const lv_32fc_t* inputBuffer,
116 unsigned int num_points)
117 {
118 2 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
119 2 unsigned int number = 0;
120 2 const unsigned int halfPoints = num_points / 2;
121
122 2 const lv_32fc_t* aPtr = inputBuffer;
123 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
124
125 2 __m128 accumulator = _mm_setzero_ps();
126 2 __m128 aVal = _mm_setzero_ps();
127
128
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
129 131070 aVal = _mm_loadu_ps((float*)aPtr);
130 131070 accumulator = _mm_add_ps(accumulator, aVal);
131 131070 aPtr += 2;
132 }
133
134 _mm_store_ps(tempBuffer, accumulator);
135
136 2 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
137 2 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
138
139 2 number = halfPoints * 2;
140
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
141 2 returnValue += (*aPtr++);
142 }
143 2 *result = returnValue;
144 2 }
145 #endif /* LV_HAVE_SSE */
146
147 #ifdef LV_HAVE_AVX
148 #include <immintrin.h>
149
150 2 static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
151 const lv_32fc_t* inputBuffer,
152 unsigned int num_points)
153 {
154 2 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
155 2 unsigned int number = 0;
156 2 const unsigned int quarterPoints = num_points / 4;
157
158 2 const lv_32fc_t* aPtr = inputBuffer;
159 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
160
161 2 __m256 accumulator = _mm256_setzero_ps();
162 2 __m256 aVal = _mm256_setzero_ps();
163
164
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
165 65534 aVal = _mm256_load_ps((float*)aPtr);
166 65534 accumulator = _mm256_add_ps(accumulator, aVal);
167 65534 aPtr += 4;
168 }
169
170 _mm256_store_ps(tempBuffer, accumulator);
171
172 2 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
173 2 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
174 2 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
175 2 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
176
177 2 number = quarterPoints * 4;
178
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
179 6 returnValue += (*aPtr++);
180 }
181 2 *result = returnValue;
182 2 }
183 #endif /* LV_HAVE_AVX */
184
185 #ifdef LV_HAVE_SSE
186 #include <xmmintrin.h>
187
188 2 static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
189 const lv_32fc_t* inputBuffer,
190 unsigned int num_points)
191 {
192 2 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
193 2 unsigned int number = 0;
194 2 const unsigned int halfPoints = num_points / 2;
195
196 2 const lv_32fc_t* aPtr = inputBuffer;
197 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
198
199 2 __m128 accumulator = _mm_setzero_ps();
200 2 __m128 aVal = _mm_setzero_ps();
201
202
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
203 131070 aVal = _mm_load_ps((float*)aPtr);
204 131070 accumulator = _mm_add_ps(accumulator, aVal);
205 131070 aPtr += 2;
206 }
207
208 _mm_store_ps(tempBuffer, accumulator);
209
210 2 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
211 2 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
212
213 2 number = halfPoints * 2;
214
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
215 2 returnValue += (*aPtr++);
216 }
217 2 *result = returnValue;
218 2 }
219 #endif /* LV_HAVE_SSE */
220
221 #ifdef LV_HAVE_NEON
222 #include <arm_neon.h>
/*!
 * NEON implementation.
 *
 * Each iteration consumes eight complex samples as four 128-bit loads and
 * adds each load into its own accumulator register (four accumulators, two
 * complex partial sums each). After the vector loop the accumulators are
 * spilled one by one and their partial sums are folded into a single value;
 * the remaining num_points % 8 samples are then added one at a time.
 *
 * \param result      Receives the sum; 0+0j when \p num_points is 0.
 * \param inputBuffer Samples to accumulate; \p num_points elements are read.
 * \param num_points  Number of complex samples to add.
 */
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    const lv_32fc_t* aPtr = inputBuffer;
    unsigned int number = 0;
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int eighthPoints = num_points / 8;
    float32x4_t in_vec;
    float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];

    for (; number < eighthPoints; number++) {
        /* The input is read-only: keep the const qualifier in every cast. */
        in_vec = vld1q_f32((const float*)aPtr);
        out_vec0 = vaddq_f32(in_vec, out_vec0);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec1 = vaddq_f32(in_vec, out_vec1);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec2 = vaddq_f32(in_vec, out_vec2);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec3 = vaddq_f32(in_vec, out_vec3);
        aPtr += 2;
    }

    /* Fold the partial sums accumulator by accumulator, low lanes first. */
    vst1q_f32(tempBuffer, out_vec0);
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec1);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec2);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec3);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Scalar tail for the samples that did not fill a whole iteration. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
277 #endif /* LV_HAVE_NEON */
278
279 #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
280