GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_s32fc_multiply_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 150 150 100.0%
Functions: 8 8 100.0%
Branches: 30 32 93.8%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_s32fc_multiply_32fc
12 *
13 * \b Overview
14 *
15 * Multiplies the input complex vector by a complex scalar and stores
16 * the results in the output vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
21 *                                    const lv_32fc_t scalar, unsigned int num_points); \endcode
22 *
23 * \b Inputs
24 * \li aVector: The input vector to be multiplied.
25 * \li scalar: The complex scalar to multiply against aVector.
26 * \li num_points: The number of complex values in aVector.
27 *
28 * \b Outputs
29 * \li cVector: The vector where the results will be stored.
30 *
31 * \b Example
32 * Generate points around the unit circle and shift their phase by pi/3 rad.
33 * \code
34 * int N = 10;
35 * unsigned int alignment = volk_get_alignment();
36 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
37 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38 * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f));
39 *
40 * float delta = 2.f*M_PI / (float)N;
41 * for(unsigned int ii = 0; ii < N/2; ++ii){
42 * // Generate points around the unit circle
43 * float real = std::cos(delta * (float)ii);
44 * float imag = std::sin(delta * (float)ii);
45 * in[ii] = lv_cmake(real, imag);
46 * in[ii+N/2] = lv_cmake(-real, -imag);
47 * }
48 *
49 * volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
50 *
51 * printf(" mag phase | mag phase\n");
52 * for(unsigned int ii = 0; ii < N; ++ii){
53 * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n",
54 * std::abs(in[ii]), std::arg(in[ii]),
55 * std::abs(out[ii]), std::arg(out[ii]));
56 * }
57 *
58 * volk_free(in);
59 * volk_free(out);
60 * \endcode
61 */
62
63 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
64 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
65
66 #include <float.h>
67 #include <inttypes.h>
68 #include <stdio.h>
69 #include <volk/volk_complex.h>
70
71 #if LV_HAVE_AVX && LV_HAVE_FMA
72 #include <immintrin.h>
73
74 2 static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
75 const lv_32fc_t* aVector,
76 const lv_32fc_t scalar,
77 unsigned int num_points)
78 {
79 2 unsigned int number = 0;
80 2 unsigned int i = 0;
81 2 const unsigned int quarterPoints = num_points / 4;
82 2 unsigned int isodd = num_points & 3; // leftover samples (num_points mod 4)
83 __m256 x, yl, yh, z, tmp1, tmp2;
84 2 lv_32fc_t* c = cVector;
85 2 const lv_32fc_t* a = aVector;
86
87 // Set up constant scalar vector
88 2 yl = _mm256_set1_ps(lv_creal(scalar));
89 2 yh = _mm256_set1_ps(lv_cimag(scalar));
90
91
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
92 65534 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
93
94 65534 tmp1 = x;
95
96 65534 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
97
98 65534 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
99
100 65534 z = _mm256_fmaddsub_ps(
101 tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
102
103 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
104
105 65534 a += 4;
106 65534 c += 4;
107 }
108
109
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
110 6 *c++ = (*a++) * scalar;
111 }
112 2 }
113 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
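
Note: every SIMD kernel in this file evaluates the standard complex product. For an
input sample a = ar + j*ai and the scalar s = sr + j*si,

    c = (ar*sr - ai*si) + j*(ai*sr + ar*si)

The kernels broadcast sr into yl and si into yh, the 0xB1 shuffle swaps each
(real, imag) lane pair, and fmaddsub/addsub applies the alternating subtract (even
lanes, real parts) and add (odd lanes, imaginary parts); the isodd tail loop then
finishes the last num_points mod 4 samples in scalar code. A scalar model of the
per-lane arithmetic, as an illustrative sketch only (complex_mul_model is a
hypothetical helper, not part of VOLK):

    static inline lv_32fc_t complex_mul_model(lv_32fc_t a, lv_32fc_t s)
    {
        const float ar = lv_creal(a), ai = lv_cimag(a);
        const float sr = lv_creal(s), si = lv_cimag(s);
        return lv_cmake(ar * sr - ai * si,  /* even lanes: fmaddsub subtracts */
                        ai * sr + ar * si); /* odd lanes:  fmaddsub adds      */
    }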
114
115 #ifdef LV_HAVE_AVX
116 #include <immintrin.h>
117
118 2 static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
119 const lv_32fc_t* aVector,
120 const lv_32fc_t scalar,
121 unsigned int num_points)
122 {
123 2 unsigned int number = 0;
124 2 unsigned int i = 0;
125 2 const unsigned int quarterPoints = num_points / 4;
126 2 unsigned int isodd = num_points & 3; // leftover samples (num_points mod 4)
127 __m256 x, yl, yh, z, tmp1, tmp2;
128 2 lv_32fc_t* c = cVector;
129 2 const lv_32fc_t* a = aVector;
130
131 // Set up constant scalar vector
132 2 yl = _mm256_set1_ps(lv_creal(scalar));
133 2 yh = _mm256_set1_ps(lv_cimag(scalar));
134
135
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
136 65534 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
137
138 65534 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
139
140 65534 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
141
142 65534 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
143
144 65534 z = _mm256_addsub_ps(tmp1,
145 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
146
147 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
148
149 65534 a += 4;
150 65534 c += 4;
151 }
152
153
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
154 6 *c++ = (*a++) * scalar;
155 }
156 2 }
157 #endif /* LV_HAVE_AVX */
158
159 #ifdef LV_HAVE_SSE3
160 #include <pmmintrin.h>
161
162 2 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
163 const lv_32fc_t* aVector,
164 const lv_32fc_t scalar,
165 unsigned int num_points)
166 {
167 2 unsigned int number = 0;
168 2 const unsigned int halfPoints = num_points / 2;
169
170 __m128 x, yl, yh, z, tmp1, tmp2;
171 2 lv_32fc_t* c = cVector;
172 2 const lv_32fc_t* a = aVector;
173
174 // Set up constant scalar vector
175 2 yl = _mm_set_ps1(lv_creal(scalar));
176 2 yh = _mm_set_ps1(lv_cimag(scalar));
177
178
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
179
180 131070 x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
181
182 131070 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
183
184 131070 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
185
186 131070 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
187
188 131070 z = _mm_addsub_ps(tmp1,
189 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
190
191 _mm_storeu_ps((float*)c, z); // Store the results back into the C container
192
193 131070 a += 2;
194 131070 c += 2;
195 }
196
197
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
198 2 *c = (*a) * scalar;
199 }
200 2 }
201 #endif /* LV_HAVE_SSE3 */
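
Because the SSE3 kernel consumes two complex samples per 128-bit vector, at most one
sample can remain, and the trailing if handles it. The report's two untaken branches
(30 of 32 in the summary above) are the even-num_points paths of this check in the
unaligned and aligned variants, each marked "✗ Branch 1 not taken."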
202
203 #ifdef LV_HAVE_GENERIC
204
205 2 static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
206 const lv_32fc_t* aVector,
207 const lv_32fc_t scalar,
208 unsigned int num_points)
209 {
210 2 lv_32fc_t* cPtr = cVector;
211 2 const lv_32fc_t* aPtr = aVector;
212 2 unsigned int number = num_points;
213
214 // unrolled loop
215
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 while (number >= 8) {
216 32766 *cPtr++ = (*aPtr++) * scalar;
217 32766 *cPtr++ = (*aPtr++) * scalar;
218 32766 *cPtr++ = (*aPtr++) * scalar;
219 32766 *cPtr++ = (*aPtr++) * scalar;
220 32766 *cPtr++ = (*aPtr++) * scalar;
221 32766 *cPtr++ = (*aPtr++) * scalar;
222 32766 *cPtr++ = (*aPtr++) * scalar;
223 32766 *cPtr++ = (*aPtr++) * scalar;
224 32766 number -= 8;
225 }
226
227 // clean up any remaining
228
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 while (number-- > 0)
229 14 *cPtr++ = *aPtr++ * scalar;
230 2 }
231 #endif /* LV_HAVE_GENERIC */
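
The generic kernel (and its aligned twin below) unrolls by eight purely as a
throughput hint to the compiler; functionally it is equivalent to this plain loop,
shown here only as a sketch:

    for (unsigned int n = 0; n < num_points; n++) {
        cVector[n] = aVector[n] * scalar; /* lv_32fc_t supports complex '*' */
    }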
232
233
234 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
235 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
236 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
237
238 #include <float.h>
239 #include <inttypes.h>
240 #include <stdio.h>
241 #include <volk/volk_complex.h>
242
243 #if LV_HAVE_AVX && LV_HAVE_FMA
244 #include <immintrin.h>
245
246 2 static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
247 const lv_32fc_t* aVector,
248 const lv_32fc_t scalar,
249 unsigned int num_points)
250 {
251 2 unsigned int number = 0;
252 2 unsigned int i = 0;
253 2 const unsigned int quarterPoints = num_points / 4;
254 2 unsigned int isodd = num_points & 3; // leftover samples (num_points mod 4)
255 __m256 x, yl, yh, z, tmp1, tmp2;
256 2 lv_32fc_t* c = cVector;
257 2 const lv_32fc_t* a = aVector;
258
259 // Set up constant scalar vector
260 2 yl = _mm256_set1_ps(lv_creal(scalar));
261 2 yh = _mm256_set1_ps(lv_cimag(scalar));
262
263
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
264 65534 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
265
266 65534 tmp1 = x;
267
268 65534 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
269
270 65534 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
271
272 65534 z = _mm256_fmaddsub_ps(
273 tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
274
275 _mm256_store_ps((float*)c, z); // Store the results back into the C container
276
277 65534 a += 4;
278 65534 c += 4;
279 }
280
281
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
282 6 *c++ = (*a++) * scalar;
283 }
284 2 }
285 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
286
287
288 #ifdef LV_HAVE_AVX
289 #include <immintrin.h>
290
291 2 static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
292 const lv_32fc_t* aVector,
293 const lv_32fc_t scalar,
294 unsigned int num_points)
295 {
296 2 unsigned int number = 0;
297 2 unsigned int i = 0;
298 2 const unsigned int quarterPoints = num_points / 4;
299 2 unsigned int isodd = num_points & 3; // leftover samples (num_points mod 4)
300 __m256 x, yl, yh, z, tmp1, tmp2;
301 2 lv_32fc_t* c = cVector;
302 2 const lv_32fc_t* a = aVector;
303
304 // Set up constant scalar vector
305 2 yl = _mm256_set1_ps(lv_creal(scalar));
306 2 yh = _mm256_set1_ps(lv_cimag(scalar));
307
308
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
309 65534 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
310
311 65534 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
312
313 65534 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
314
315 65534 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
316
317 65534 z = _mm256_addsub_ps(tmp1,
318 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
319
320 _mm256_store_ps((float*)c, z); // Store the results back into the C container
321
322 65534 a += 4;
323 65534 c += 4;
324 }
325
326
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = num_points - isodd; i < num_points; i++) {
327 6 *c++ = (*a++) * scalar;
328 }
329 2 }
330 #endif /* LV_HAVE_AVX */
331
332 #ifdef LV_HAVE_SSE3
333 #include <pmmintrin.h>
334
335 2 static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
336 const lv_32fc_t* aVector,
337 const lv_32fc_t scalar,
338 unsigned int num_points)
339 {
340 2 unsigned int number = 0;
341 2 const unsigned int halfPoints = num_points / 2;
342
343 __m128 x, yl, yh, z, tmp1, tmp2;
344 2 lv_32fc_t* c = cVector;
345 2 const lv_32fc_t* a = aVector;
346
347 // Set up constant scalar vector
348 2 yl = _mm_set_ps1(lv_creal(scalar));
349 2 yh = _mm_set_ps1(lv_cimag(scalar));
350
351
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
131072 for (; number < halfPoints; number++) {
352
353 131070 x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
354
355 131070 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
356
357 131070 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
358
359 131070 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
360
361 131070 z = _mm_addsub_ps(tmp1,
362 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
363
364 _mm_store_ps((float*)c, z); // Store the results back into the C container
365
366 131070 a += 2;
367 131070 c += 2;
368 }
369
370
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if ((num_points % 2) != 0) {
371 2 *c = (*a) * scalar;
372 }
373 2 }
374 #endif /* LV_HAVE_SSE3 */
375
376 #ifdef LV_HAVE_NEON
377 #include <arm_neon.h>
378
379 static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
380 const lv_32fc_t* aVector,
381 const lv_32fc_t scalar,
382 unsigned int num_points)
383 {
384 lv_32fc_t* cPtr = cVector;
385 const lv_32fc_t* aPtr = aVector;
386 unsigned int number = num_points;
387 unsigned int quarter_points = num_points / 4;
388
389 float32x4x2_t a_val, scalar_val;
390 float32x4x2_t tmp_imag;
391
392 scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
393 scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
394 for (number = 0; number < quarter_points; ++number) {
395 a_val = vld2q_f32((float*)aPtr);
396 tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
397 tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
398
399 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
400 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
401
402 vst2q_f32((float*)cPtr, tmp_imag);
403 aPtr += 4;
404 cPtr += 4;
405 }
406
407 for (number = quarter_points * 4; number < num_points; number++) {
408 *cPtr++ = *aPtr++ * scalar;
409 }
410 }
411 #endif /* LV_HAVE_NEON */
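
The NEON kernel takes a planar approach instead: vld2q_f32 deinterleaves four complex
samples so that a_val.val[0] holds the real parts and a_val.val[1] the imaginary
parts, and the multiply/accumulate sequence evaluates the same product as the x86
kernels:

    real: ar*sr - ai*si   (vmulq_f32, then vmlsq_f32)
    imag: ai*sr + ar*si   (vmulq_f32, then vmlaq_f32)

vst2q_f32 re-interleaves the results on the store, and the final scalar loop handles
the num_points mod 4 remainder.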
412
413 #ifdef LV_HAVE_GENERIC
414
415 2 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
416 const lv_32fc_t* aVector,
417 const lv_32fc_t scalar,
418 unsigned int num_points)
419 {
420 2 lv_32fc_t* cPtr = cVector;
421 2 const lv_32fc_t* aPtr = aVector;
422 2 unsigned int number = num_points;
423
424 // unrolled loop
425
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 while (number >= 8) {
426 32766 *cPtr++ = (*aPtr++) * scalar;
427 32766 *cPtr++ = (*aPtr++) * scalar;
428 32766 *cPtr++ = (*aPtr++) * scalar;
429 32766 *cPtr++ = (*aPtr++) * scalar;
430 32766 *cPtr++ = (*aPtr++) * scalar;
431 32766 *cPtr++ = (*aPtr++) * scalar;
432 32766 *cPtr++ = (*aPtr++) * scalar;
433 32766 *cPtr++ = (*aPtr++) * scalar;
434 32766 number -= 8;
435 }
436
437 // clean up any remaining
438
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 while (number-- > 0)
439 14 *cPtr++ = *aPtr++ * scalar;
440 2 }
441 #endif /* LV_HAVE_GENERIC */
442
443 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
444