GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_add_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 76 76 100.0%
Functions: 5 5 100.0%
Branches: 18 18 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_add_32fc
12 *
13 * \b Overview
14 *
15 * Adds two vectors together element by element:
16 *
17 * c[i] = a[i] + b[i]
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
22 * lv_32fc_t* bVector, unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li aVector: First vector of input points.
26 * \li bVector: Second vector of input points.
27 * \li num_points: The number of values in each input vector.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 *
34 * The following example adds the increasing and decreasing vectors such that the result of
35 * every summation pair is 10
36 *
37 * \code
38 * int N = 10;
39 * unsigned int alignment = volk_get_alignment();
40 * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
41 * lv_32fc_t* decreasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
42 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
43 *
44 * for(unsigned int ii = 0; ii < N; ++ii){
45 * increasing[ii] = (lv_32fc_t)ii;
46 * decreasing[ii] = 10.f - (lv_32fc_t)ii;
47 * }
48 *
49 * volk_32fc_x2_add_32fc(out, increasing, decreasing, N);
50 *
51 * for(unsigned int ii = 0; ii < N; ++ii){
52 * printf("out[%u] = %1.2f + %1.2fj\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
53 * }
54 *
55 * volk_free(increasing);
56 * volk_free(decreasing);
57 * volk_free(out);
58 * \endcode
59 */
60
61 #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
62 #define INCLUDED_volk_32fc_x2_add_32fc_u_H
63
64 #ifdef LV_HAVE_AVX
65 #include <immintrin.h>
66
67 2 static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
68 const lv_32fc_t* aVector,
69 const lv_32fc_t* bVector,
70 unsigned int num_points)
71 {
72 2 unsigned int number = 0;
73 2 const unsigned int quarterPoints = num_points / 4;
74
75 2 lv_32fc_t* cPtr = cVector;
76 2 const lv_32fc_t* aPtr = aVector;
77 2 const lv_32fc_t* bPtr = bVector;
78
79 __m256 aVal, bVal, cVal;
80
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
81
82 65534 aVal = _mm256_loadu_ps((float*)aPtr);
83 65534 bVal = _mm256_loadu_ps((float*)bPtr);
84
85 65534 cVal = _mm256_add_ps(aVal, bVal);
86
87 _mm256_storeu_ps((float*)cPtr,
88 cVal); // Store the results back into the C container
89
90 65534 aPtr += 4;
91 65534 bPtr += 4;
92 65534 cPtr += 4;
93 }
94
95 2 number = quarterPoints * 4;
96
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
97 6 *cPtr++ = (*aPtr++) + (*bPtr++);
98 }
99 2 }
100 #endif /* LV_HAVE_AVX */
101
102
103 #ifdef LV_HAVE_AVX
104 #include <immintrin.h>
105
106 2 static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
107 const lv_32fc_t* aVector,
108 const lv_32fc_t* bVector,
109 unsigned int num_points)
110 {
111 2 unsigned int number = 0;
112 2 const unsigned int quarterPoints = num_points / 4;
113
114 2 lv_32fc_t* cPtr = cVector;
115 2 const lv_32fc_t* aPtr = aVector;
116 2 const lv_32fc_t* bPtr = bVector;
117
118 __m256 aVal, bVal, cVal;
119
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
120
121 65534 aVal = _mm256_load_ps((float*)aPtr);
122 65534 bVal = _mm256_load_ps((float*)bPtr);
123
124 65534 cVal = _mm256_add_ps(aVal, bVal);
125
126 _mm256_store_ps((float*)cPtr,
127 cVal); // Store the results back into the C container
128
129 65534 aPtr += 4;
130 65534 bPtr += 4;
131 65534 cPtr += 4;
132 }
133
134 2 number = quarterPoints * 4;
135
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
136 6 *cPtr++ = (*aPtr++) + (*bPtr++);
137 }
138 2 }
139 #endif /* LV_HAVE_AVX */
140
141
142 #ifdef LV_HAVE_SSE
143 #include <xmmintrin.h>
144
145 2 static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
146 const lv_32fc_t* aVector,
147 const lv_32fc_t* bVector,
148 unsigned int num_points)
149 {
150 2 unsigned int number = 0;
151 2 const unsigned int halfPoints = num_points / 2;
152
153 2 lv_32fc_t* cPtr = cVector;
154 2 const lv_32fc_t* aPtr = aVector;
155 2 const lv_32fc_t* bPtr = bVector;
156
157 __m128 aVal, bVal, cVal;
158
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
159
160 131070 aVal = _mm_loadu_ps((float*)aPtr);
161 131070 bVal = _mm_loadu_ps((float*)bPtr);
162
163 131070 cVal = _mm_add_ps(aVal, bVal);
164
165 _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
166
167 131070 aPtr += 2;
168 131070 bPtr += 2;
169 131070 cPtr += 2;
170 }
171
172 2 number = halfPoints * 2;
173
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
174 2 *cPtr++ = (*aPtr++) + (*bPtr++);
175 }
176 2 }
177 #endif /* LV_HAVE_SSE */
178
179
180 #ifdef LV_HAVE_GENERIC
181
182 2 static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
183 const lv_32fc_t* aVector,
184 const lv_32fc_t* bVector,
185 unsigned int num_points)
186 {
187 2 lv_32fc_t* cPtr = cVector;
188 2 const lv_32fc_t* aPtr = aVector;
189 2 const lv_32fc_t* bPtr = bVector;
190 2 unsigned int number = 0;
191
192
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
193 262142 *cPtr++ = (*aPtr++) + (*bPtr++);
194 }
195 2 }
196 #endif /* LV_HAVE_GENERIC */
197
198
199 #ifdef LV_HAVE_SSE
200 #include <xmmintrin.h>
201
202 2 static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
203 const lv_32fc_t* aVector,
204 const lv_32fc_t* bVector,
205 unsigned int num_points)
206 {
207 2 unsigned int number = 0;
208 2 const unsigned int halfPoints = num_points / 2;
209
210 2 lv_32fc_t* cPtr = cVector;
211 2 const lv_32fc_t* aPtr = aVector;
212 2 const lv_32fc_t* bPtr = bVector;
213
214 __m128 aVal, bVal, cVal;
215
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
216 131070 aVal = _mm_load_ps((float*)aPtr);
217 131070 bVal = _mm_load_ps((float*)bPtr);
218
219 131070 cVal = _mm_add_ps(aVal, bVal);
220
221 _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
222
223 131070 aPtr += 2;
224 131070 bPtr += 2;
225 131070 cPtr += 2;
226 }
227
228 2 number = halfPoints * 2;
229
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (; number < num_points; number++) {
230 2 *cPtr++ = (*aPtr++) + (*bPtr++);
231 }
232 2 }
233 #endif /* LV_HAVE_SSE */
234
235
236 #ifdef LV_HAVE_NEON
237 #include <arm_neon.h>
238
239 static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
240 const lv_32fc_t* aVector,
241 const lv_32fc_t* bVector,
242 unsigned int num_points)
243 {
244 unsigned int number = 0;
245 const unsigned int halfPoints = num_points / 2;
246
247 lv_32fc_t* cPtr = cVector;
248 const lv_32fc_t* aPtr = aVector;
249 const lv_32fc_t* bPtr = bVector;
250 float32x4_t aVal, bVal, cVal;
251 for (number = 0; number < halfPoints; number++) {
252 // Load in to NEON registers
253 aVal = vld1q_f32((const float32_t*)(aPtr));
254 bVal = vld1q_f32((const float32_t*)(bPtr));
255 __VOLK_PREFETCH(aPtr + 2);
256 __VOLK_PREFETCH(bPtr + 2);
257
258 // vector add
259 cVal = vaddq_f32(aVal, bVal);
260 // Store the results back into the C container
261 vst1q_f32((float*)(cPtr), cVal);
262
263 aPtr += 2; // q registers hold 4 floats, i.e. 2 lv_32fc_ts per vadd
264 bPtr += 2;
265 cPtr += 2;
266 }
267
268 number = halfPoints * 2; // equals num_points only when num_points is even
269 for (; number < num_points; number++) {
270 *cPtr++ = (*aPtr++) + (*bPtr++);
271 }
272 }
273
274 #endif /* LV_HAVE_NEON */
275
276
277 #endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */
278