GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
Date: 2023-10-23 23:10:04
            Exec   Total   Coverage
Lines:        98      98     100.0%
Functions:     5       5     100.0%
Branches:     18      20      90.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
12 *
13 * \b Overview
14 *
15 * Conjugate the input vector bVector, multiply it by a complex scalar,
16 * add the input vector aVector, and return the result.
17 *
18 * c[i] = a[i] + conj(b[i]) * scalar
19 *
20 * <b>Dispatcher Prototype</b>
21 * \code
22 * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const
23 * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int
24 * num_points); \endcode
25 *
26 * \b Inputs
27 * \li aVector: The input vector to be added.
28 * \li bVector: The input vector to be conjugated and multiplied.
29 * \li scalar: The complex scalar to multiply with the conjugated bVector.
30 * \li num_points: The number of complex values in aVector and bVector to be
31 * conjugated, multiplied, added, and stored into cVector.
32 *
33 * \b Outputs
34 * \li cVector: The vector where the results will be stored.
35 *
36 * \b Example
37 * Update adaptive filter coefficients.
38 *
39 * \code
40 * int n_filter = 2 * N + 1;
41 * unsigned int alignment = volk_get_alignment();
42 *
43 * // state is a queue of input IQ data.
44 * lv_32fc_t* state = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
45 * // weight and next one.
46 * lv_32fc_t* weight = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
47 * lv_32fc_t* next = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
48 * ...
49 * // push back input IQ data into state.
50 * foo_push_back_queue(state, input);
51 *
52 * // get filtered output.
53 * lv_32fc_t output = lv_cmake(0.f,0.f);
54 * for (int i = 0; i < n_filter; i++) {
55 * output += state[i] * weight[i];
56 * }
57 *
58 * // update weight using output.
59 * float real = lv_creal(output) * (1.0 - std::norm(output)) * MU;
60 * lv_32fc_t factor = lv_cmake(real, 0.f);
61 * volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(next, weight, state, factor, n_filter);
62 * lv_32fc_t *tmp = next;
63 * next = weight;
64 * weight = tmp;
65 * weight[N + 1] = lv_cmake(lv_creal(weight[N + 1]), 0.f);
66 * ...
67 * volk_free(state);
68 * volk_free(weight);
69 * volk_free(next);
70 * \endcode
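*
* A second, minimal call sketch (hypothetical buffer names; assumes a and b
* already hold num_points complex samples):
*
* \code
* unsigned int num_points = 1024;
* unsigned int alignment = volk_get_alignment();
* lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
* lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
* lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
* lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);
* // ... fill a and b with samples ...
* // computes c[i] = a[i] + conj(b[i]) * scalar
* volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, num_points);
* volk_free(a);
* volk_free(b);
* volk_free(c);
* \endcode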
71 */
72
73 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
74 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
75
76 #include <float.h>
77 #include <inttypes.h>
78 #include <stdio.h>
79 #include <volk/volk_complex.h>
80
81
82 #ifdef LV_HAVE_GENERIC
83
84 static inline void
85 2 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
86 const lv_32fc_t* aVector,
87 const lv_32fc_t* bVector,
88 const lv_32fc_t scalar,
89 unsigned int num_points)
90 {
91 2 const lv_32fc_t* aPtr = aVector;
92 2 const lv_32fc_t* bPtr = bVector;
93 2 lv_32fc_t* cPtr = cVector;
94 2 unsigned int number = num_points;
95
96 // unrolled loop: process eight complex samples per iteration
97
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 while (number >= 8) {
98 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
99 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
100 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
101 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
102 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
103 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
104 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
105 32766 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
106 32766 number -= 8;
107 }
108
109 // clean up any remaining
110
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 while (number-- > 0) {
111 14 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
112 }
113 2 }
114 #endif /* LV_HAVE_GENERIC */
115
116
117 #ifdef LV_HAVE_AVX
118 #include <immintrin.h>
119 #include <volk/volk_avx_intrinsics.h>
120
121 static inline void
122 2 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
123 const lv_32fc_t* aVector,
124 const lv_32fc_t* bVector,
125 const lv_32fc_t scalar,
126 unsigned int num_points)
127 {
128 2 unsigned int number = 0;
129 2 unsigned int i = 0;
130 2 const unsigned int quarterPoints = num_points / 4;
131 2 unsigned int isodd = num_points & 3;
132
133 __m256 x, y, s, z;
134 2 lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
135
136 2 const lv_32fc_t* a = aVector;
137 2 const lv_32fc_t* b = bVector;
138 2 lv_32fc_t* c = cVector;
139
140 // Set up constant scalar vector
141 2 s = _mm256_loadu_ps((float*)v_scalar);
142
143
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
144 65534 x = _mm256_loadu_ps((float*)b);
145 65534 y = _mm256_loadu_ps((float*)a);
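// z = scalar * conj(b) for four complex samples (x holds b, s holds the broadcast scalar)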
146 65534 z = _mm256_complexconjugatemul_ps(s, x);
147 65534 z = _mm256_add_ps(y, z);
148 _mm256_storeu_ps((float*)c, z);
149
150 65534 a += 4;
151 65534 b += 4;
152 65534 c += 4;
153 }
154
155
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
156 6 *c++ = (*a++) + lv_conj(*b++) * scalar;
157 }
158 2 }
159 #endif /* LV_HAVE_AVX */
160
161
162 #ifdef LV_HAVE_SSE3
163 #include <pmmintrin.h>
164 #include <volk/volk_sse3_intrinsics.h>
165
166 static inline void
167 2 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
168 const lv_32fc_t* aVector,
169 const lv_32fc_t* bVector,
170 const lv_32fc_t scalar,
171 unsigned int num_points)
172 {
173 2 unsigned int number = 0;
174 2 const unsigned int halfPoints = num_points / 2;
175
176 __m128 x, y, s, z;
177 2 lv_32fc_t v_scalar[2] = { scalar, scalar };
178
179 2 const lv_32fc_t* a = aVector;
180 2 const lv_32fc_t* b = bVector;
181 2 lv_32fc_t* c = cVector;
182
183 // Set up constant scalar vector
184 2 s = _mm_loadu_ps((float*)v_scalar);
185
186
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
187 131070 x = _mm_loadu_ps((float*)b);
188 131070 y = _mm_loadu_ps((float*)a);
189 131070 z = _mm_complexconjugatemul_ps(s, x);
190 131070 z = _mm_add_ps(y, z);
191 _mm_storeu_ps((float*)c, z);
192
193 131070 a += 2;
194 131070 b += 2;
195 131070 c += 2;
196 }
197
198
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
199 2 *c = *a + lv_conj(*b) * scalar;
200 }
201 2 }
202 #endif /* LV_HAVE_SSE3 */
203
204
205 #ifdef LV_HAVE_AVX
206 #include <immintrin.h>
207 #include <volk/volk_avx_intrinsics.h>
208
209 static inline void
210 2 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
211 const lv_32fc_t* aVector,
212 const lv_32fc_t* bVector,
213 const lv_32fc_t scalar,
214 unsigned int num_points)
215 {
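// Aligned variant: aVector, bVector and cVector must be 32-byte aligned,
// e.g. allocated with volk_malloc() using volk_get_alignment().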
216 2 unsigned int number = 0;
217 2 unsigned int i = 0;
218 2 const unsigned int quarterPoints = num_points / 4;
219 2 unsigned int isodd = num_points & 3;
220
221 __m256 x, y, s, z;
222 2 lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
223
224 2 const lv_32fc_t* a = aVector;
225 2 const lv_32fc_t* b = bVector;
226 2 lv_32fc_t* c = cVector;
227
228 // Set up constant scalar vector
229 2 s = _mm256_loadu_ps((float*)v_scalar);
230
231
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
232 65534 x = _mm256_load_ps((float*)b);
233 65534 y = _mm256_load_ps((float*)a);
234 65534 z = _mm256_complexconjugatemul_ps(s, x);
235 65534 z = _mm256_add_ps(y, z);
236 _mm256_store_ps((float*)c, z);
237
238 65534 a += 4;
239 65534 b += 4;
240 65534 c += 4;
241 }
242
243
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
244 6 *c++ = (*a++) + lv_conj(*b++) * scalar;
245 }
246 2 }
247 #endif /* LV_HAVE_AVX */
248
249
250 #ifdef LV_HAVE_SSE3
251 #include <pmmintrin.h>
252 #include <volk/volk_sse3_intrinsics.h>
253
254 static inline void
255 2 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
256 const lv_32fc_t* aVector,
257 const lv_32fc_t* bVector,
258 const lv_32fc_t scalar,
259 unsigned int num_points)
260 {
261 2 unsigned int number = 0;
262 2 const unsigned int halfPoints = num_points / 2;
263
264 __m128 x, y, s, z;
265 2 lv_32fc_t v_scalar[2] = { scalar, scalar };
266
267 2 const lv_32fc_t* a = aVector;
268 2 const lv_32fc_t* b = bVector;
269 2 lv_32fc_t* c = cVector;
270
271 // Set up constant scalar vector
272 2 s = _mm_loadu_ps((float*)v_scalar);
273
274
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
275 131070 x = _mm_load_ps((float*)b);
276 131070 y = _mm_load_ps((float*)a);
277 131070 z = _mm_complexconjugatemul_ps(s, x);
278 131070 z = _mm_add_ps(y, z);
279 _mm_store_ps((float*)c, z);
280
281 131070 a += 2;
282 131070 b += 2;
283 131070 c += 2;
284 }
285
286
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
287 2 *c = *a + lv_conj(*b) * scalar;
288 }
289 2 }
290 #endif /* LV_HAVE_SSE3 */
291
292
293 #ifdef LV_HAVE_NEON
294 #include <arm_neon.h>
295
296 static inline void
297 volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
298 const lv_32fc_t* aVector,
299 const lv_32fc_t* bVector,
300 const lv_32fc_t scalar,
301 unsigned int num_points)
302 {
303 const lv_32fc_t* bPtr = bVector;
304 const lv_32fc_t* aPtr = aVector;
305 lv_32fc_t* cPtr = cVector;
306 unsigned int number = num_points;
307 unsigned int quarter_points = num_points / 4;
308
309 float32x4x2_t a_val, b_val, c_val, scalar_val;
310 float32x4x2_t tmp_val;
311
312 scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
313 scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
314
315 for (number = 0; number < quarter_points; ++number) {
316 a_val = vld2q_f32((float*)aPtr);
317 b_val = vld2q_f32((float*)bPtr);
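// conjugate b by negating its imaginary lane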
318 b_val.val[1] = vnegq_f32(b_val.val[1]);
319 __VOLK_PREFETCH(aPtr + 8);
320 __VOLK_PREFETCH(bPtr + 8);
321
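// with bi already negated above, the multiply/accumulate steps compute conj(b) * scalar:
//   real = br*sr + bi*si,   imag = br*si - bi*sr   (br, bi = original components of b)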
322 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
323 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
324
325 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
326 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
327
328 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
329 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
330
331 vst2q_f32((float*)cPtr, c_val);
332
333 aPtr += 4;
334 bPtr += 4;
335 cPtr += 4;
336 }
337
338 for (number = quarter_points * 4; number < num_points; number++) {
339 *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
340 }
341 }
342 #endif /* LV_HAVE_NEON */
343
344 #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
345