GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_multiply_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 127 127 100.0%
Functions: 9 9 100.0%
Branches: 26 28 92.9%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_multiply_32fc
12 *
13 * \b Overview
14 *
15 * Multiplies two complex vectors and returns the complex result.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
20 * lv_32fc_t* bVector, unsigned int num_points); \endcode
21 *
22 * \b Inputs
23 * \li aVector: The first input vector of complex floats.
24 * \li bVector: The second input vector of complex floats.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li cVector: The output vector of complex floats.
29 *
30 * \b Example
31 * Mix two signals at f=0.3 and 0.1.
32 * \code
33 * int N = 10;
34 * unsigned int alignment = volk_get_alignment();
35 * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
36 * lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38 *
39 * for(unsigned int ii = 0; ii < N; ++ii){
40 * // Generate two tones
41 * float real_1 = std::cos(0.3f * (float)ii);
42 * float imag_1 = std::sin(0.3f * (float)ii);
43 * sig_1[ii] = lv_cmake(real_1, imag_1);
44 * float real_2 = std::cos(0.1f * (float)ii);
45 * float imag_2 = std::sin(0.1f * (float)ii);
46 * sig_2[ii] = lv_cmake(real_2, imag_2);
47 * }
48 *
49 * volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
50 *
51 * volk_free(sig_1);
52 * volk_free(sig_2);
53 * volk_free(out);
54 * \endcode
55 */
56
57 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
58 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
59
60 #include <float.h>
61 #include <inttypes.h>
62 #include <stdio.h>
63 #include <volk/volk_complex.h>
64
65 #if LV_HAVE_AVX2 && LV_HAVE_FMA
66 #include <immintrin.h>
67 /*!
68 \brief Multiplies the two input complex vectors and stores their results in the third vector
69 \param cVector The vector where the results will be stored
70 \param aVector One of the vectors to be multiplied
71 \param bVector One of the vectors to be multiplied
72 \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
73 */
74 2 static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
75 const lv_32fc_t* aVector,
76 const lv_32fc_t* bVector,
77 unsigned int num_points)
78 {
79 2 unsigned int number = 0;
80 2 const unsigned int quarterPoints = num_points / 4;
81
82 2 lv_32fc_t* c = cVector;
83 2 const lv_32fc_t* a = aVector;
84 2 const lv_32fc_t* b = bVector;
85
86
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
87
88 const __m256 x =
89 65534 _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
90 const __m256 y =
91 65534 _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
92
93 65534 const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
94 65534 const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
95
96 65534 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
97
98 65534 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
99
100 65534 const __m256 z = _mm256_fmaddsub_ps(
101 x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
102
103 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
104
105 65534 a += 4;
106 65534 b += 4;
107 65534 c += 4;
108 }
109
110 2 number = quarterPoints * 4;
111
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
112 6 *c++ = (*a++) * (*b++);
113 }
114 2 }
115 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
116
117
118 #ifdef LV_HAVE_AVX
119 #include <immintrin.h>
120 #include <volk/volk_avx_intrinsics.h>
121
122 2 static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
123 const lv_32fc_t* aVector,
124 const lv_32fc_t* bVector,
125 unsigned int num_points)
126 {
127 2 unsigned int number = 0;
128 2 const unsigned int quarterPoints = num_points / 4;
129
130 __m256 x, y, z;
131 2 lv_32fc_t* c = cVector;
132 2 const lv_32fc_t* a = aVector;
133 2 const lv_32fc_t* b = bVector;
134
135
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
136 65534 x = _mm256_loadu_ps(
137 (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
138 65534 y = _mm256_loadu_ps(
139 (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
140 65534 z = _mm256_complexmul_ps(x, y);
141 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
142
143 65534 a += 4;
144 65534 b += 4;
145 65534 c += 4;
146 }
147
148 2 number = quarterPoints * 4;
149
150
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
151 6 *c++ = (*a++) * (*b++);
152 }
153 2 }
154 #endif /* LV_HAVE_AVX */
155
156
157 #ifdef LV_HAVE_SSE3
158 #include <pmmintrin.h>
159 #include <volk/volk_sse3_intrinsics.h>
160
161 2 static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
162 const lv_32fc_t* aVector,
163 const lv_32fc_t* bVector,
164 unsigned int num_points)
165 {
166 2 unsigned int number = 0;
167 2 const unsigned int halfPoints = num_points / 2;
168
169 __m128 x, y, z;
170 2 lv_32fc_t* c = cVector;
171 2 const lv_32fc_t* a = aVector;
172 2 const lv_32fc_t* b = bVector;
173
174
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
175 131070 x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
176 131070 y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
177 131070 z = _mm_complexmul_ps(x, y);
178 _mm_storeu_ps((float*)c, z); // Store the results back into the C container
179
180 131070 a += 2;
181 131070 b += 2;
182 131070 c += 2;
183 }
184
185
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
186 2 *c = (*a) * (*b);
187 }
188 2 }
189 #endif /* LV_HAVE_SSE3 */
190
191
192 #ifdef LV_HAVE_GENERIC
193
194 2 static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
195 const lv_32fc_t* aVector,
196 const lv_32fc_t* bVector,
197 unsigned int num_points)
198 {
199 2 lv_32fc_t* cPtr = cVector;
200 2 const lv_32fc_t* aPtr = aVector;
201 2 const lv_32fc_t* bPtr = bVector;
202 2 unsigned int number = 0;
203
204
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
205 262142 *cPtr++ = (*aPtr++) * (*bPtr++);
206 }
207 2 }
208 #endif /* LV_HAVE_GENERIC */
209
210
211 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
212 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
213 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
214
215 #include <float.h>
216 #include <inttypes.h>
217 #include <stdio.h>
218 #include <volk/volk_complex.h>
219
220 #if LV_HAVE_AVX2 && LV_HAVE_FMA
221 #include <immintrin.h>
222 /*!
223 \brief Multiplies the two input complex vectors and stores their results in the third vector
224 \param cVector The vector where the results will be stored
225 \param aVector One of the vectors to be multiplied
226 \param bVector One of the vectors to be multiplied
227 \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
228 */
229 2 static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
230 const lv_32fc_t* aVector,
231 const lv_32fc_t* bVector,
232 unsigned int num_points)
233 {
234 2 unsigned int number = 0;
235 2 const unsigned int quarterPoints = num_points / 4;
236
237 2 lv_32fc_t* c = cVector;
238 2 const lv_32fc_t* a = aVector;
239 2 const lv_32fc_t* b = bVector;
240
241
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
242
243 const __m256 x =
244 65534 _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
245 const __m256 y =
246 65534 _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
247
248 65534 const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
249 65534 const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
250
251 65534 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
252
253 65534 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
254
255 65534 const __m256 z = _mm256_fmaddsub_ps(
256 x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
257
258 _mm256_store_ps((float*)c, z); // Store the results back into the C container
259
260 65534 a += 4;
261 65534 b += 4;
262 65534 c += 4;
263 }
264
265 2 number = quarterPoints * 4;
266
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
267 6 *c++ = (*a++) * (*b++);
268 }
269 2 }
270 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
271
272
273 #ifdef LV_HAVE_AVX
274 #include <immintrin.h>
275 #include <volk/volk_avx_intrinsics.h>
276
277 2 static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
278 const lv_32fc_t* aVector,
279 const lv_32fc_t* bVector,
280 unsigned int num_points)
281 {
282 2 unsigned int number = 0;
283 2 const unsigned int quarterPoints = num_points / 4;
284
285 __m256 x, y, z;
286 2 lv_32fc_t* c = cVector;
287 2 const lv_32fc_t* a = aVector;
288 2 const lv_32fc_t* b = bVector;
289
290
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
291 65534 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
292 65534 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
293 65534 z = _mm256_complexmul_ps(x, y);
294 _mm256_store_ps((float*)c, z); // Store the results back into the C container
295
296 65534 a += 4;
297 65534 b += 4;
298 65534 c += 4;
299 }
300
301 2 number = quarterPoints * 4;
302
303
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
304 6 *c++ = (*a++) * (*b++);
305 }
306 2 }
307 #endif /* LV_HAVE_AVX */
308
309 #ifdef LV_HAVE_SSE3
310 #include <pmmintrin.h>
311 #include <volk/volk_sse3_intrinsics.h>
312
313 2 static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
314 const lv_32fc_t* aVector,
315 const lv_32fc_t* bVector,
316 unsigned int num_points)
317 {
318 2 unsigned int number = 0;
319 2 const unsigned int halfPoints = num_points / 2;
320
321 __m128 x, y, z;
322 2 lv_32fc_t* c = cVector;
323 2 const lv_32fc_t* a = aVector;
324 2 const lv_32fc_t* b = bVector;
325
326
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
327 131070 x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
328 131070 y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
329 131070 z = _mm_complexmul_ps(x, y);
330 _mm_store_ps((float*)c, z); // Store the results back into the C container
331
332 131070 a += 2;
333 131070 b += 2;
334 131070 c += 2;
335 }
336
337
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
338 2 *c = (*a) * (*b);
339 }
340 2 }
341 #endif /* LV_HAVE_SSE3 */
342
343
344 #ifdef LV_HAVE_GENERIC
345
346 2 static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
347 const lv_32fc_t* aVector,
348 const lv_32fc_t* bVector,
349 unsigned int num_points)
350 {
351 2 lv_32fc_t* cPtr = cVector;
352 2 const lv_32fc_t* aPtr = aVector;
353 2 const lv_32fc_t* bPtr = bVector;
354 2 unsigned int number = 0;
355
356
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
357 262142 *cPtr++ = (*aPtr++) * (*bPtr++);
358 }
359 2 }
360 #endif /* LV_HAVE_GENERIC */
361
362
363 #ifdef LV_HAVE_NEON
364 #include <arm_neon.h>
365
366 static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
367 const lv_32fc_t* aVector,
368 const lv_32fc_t* bVector,
369 unsigned int num_points)
370 {
371 lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
372 lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
373 unsigned int quarter_points = num_points / 4;
374 float32x4x2_t a_val, b_val, c_val;
375 float32x4x2_t tmp_real, tmp_imag;
376 unsigned int number = 0;
377
378 for (number = 0; number < quarter_points; ++number) {
379 a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
380 b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
381 __VOLK_PREFETCH(a_ptr + 4);
382 __VOLK_PREFETCH(b_ptr + 4);
383
384 // multiply the real*real and imag*imag to get real result
385 // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
386 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
387 // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
388 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
389
390 // Multiply cross terms to get the imaginary result
391 // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
392 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
393 // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
394 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
395
396 // store the results
397 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
398 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
399 vst2q_f32((float*)cVector, c_val);
400
401 a_ptr += 4;
402 b_ptr += 4;
403 cVector += 4;
404 }
405
406 for (number = quarter_points * 4; number < num_points; number++) {
407 *cVector++ = (*a_ptr++) * (*b_ptr++);
408 }
409 }
410 #endif /* LV_HAVE_NEON */
411
412
413 #ifdef LV_HAVE_NEON
414
415 static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
416 const lv_32fc_t* aVector,
417 const lv_32fc_t* bVector,
418 unsigned int num_points)
419 {
420 lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
421 lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
422 unsigned int quarter_points = num_points / 4;
423 float32x4x2_t a_val, b_val;
424 float32x4x2_t tmp_imag;
425 unsigned int number = 0;
426
427 for (number = 0; number < quarter_points; ++number) {
428 a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
429 b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
430 __VOLK_PREFETCH(a_ptr + 4);
431 __VOLK_PREFETCH(b_ptr + 4);
432
433 // do the first multiply
434 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
435 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
436
437 // use multiply accumulate/subtract to get result
438 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
439 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
440
441 // store
442 vst2q_f32((float*)cVector, tmp_imag);
443 // increment pointers
444 a_ptr += 4;
445 b_ptr += 4;
446 cVector += 4;
447 }
448
449 for (number = quarter_points * 4; number < num_points; number++) {
450 *cVector++ = (*a_ptr++) * (*b_ptr++);
451 }
452 }
453 #endif /* LV_HAVE_NEON */
454
455
456 #ifdef LV_HAVE_NEONV7
457
458 extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
459 const lv_32fc_t* aVector,
460 const lv_32fc_t* bVector,
461 unsigned int num_points);
462 #endif /* LV_HAVE_NEONV7 */
463
464
465 #ifdef LV_HAVE_ORC
466
467 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
468 const lv_32fc_t* aVector,
469 const lv_32fc_t* bVector,
470 unsigned int num_points);
471
472 2 static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
473 const lv_32fc_t* aVector,
474 const lv_32fc_t* bVector,
475 unsigned int num_points)
476 {
477 2 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
478 2 }
479
480 #endif /* LV_HAVE_ORC */
481
482 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
483