GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
Date: 2023-10-23 23:10:04
            Exec  Total  Coverage
Lines:       314    314    100.0%
Functions:     7      7    100.0%
Branches:     67     76     88.2%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_s32fc_x2_rotator_32fc
12 *
13 * \b Overview
14 *
15 * Rotates the input vector at a fixed rate per sample, starting from an
16 * initial phase offset.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
21 * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li inVector: Vector to be rotated.
25 * \li phase_inc: Per-sample phase increment (the rotational velocity).
26 * \li phase: Initial phase offset.
27 * \li num_points: The number of values in inVector to be rotated and stored into
28 * outVector.
29 *
30 * \b Outputs
31 * \li outVector: The vector where the results will be stored.
32 *
33 * \b Example
34 * Generate a tone at f=0.3 (normalized frequency) and use the rotator with
35 * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC
36 * tone (initialize the input with lv_cmake(1, 0)) to observe the rotator generating a signal.
37 * \code
38 * const unsigned int N = 10;
39 * unsigned int alignment = volk_get_alignment();
40 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
41 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
42 *
43 * for(unsigned int ii = 0; ii < N; ++ii){
44 * // Generate a tone at f=0.3
45 * float real = std::cos(0.3f * (float)ii);
46 * float imag = std::sin(0.3f * (float)ii);
47 * in[ii] = lv_cmake(real, imag);
48 * }
49 * // The oscillator rotates at f=0.1
50 * float frequency = 0.1f;
51 * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency));
52 * lv_32fc_t phase = lv_cmake(1.0f, 0.0f); // start at 1 (0 rad phase)
53 *
54 * // rotate so the output is a tone at f=0.4
55 * volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_increment, &phase, N);
56 *
57 * // print results for inspection
58 * for(unsigned int ii = 0; ii < N; ++ii){
59 * printf("out[%u] = %+1.2f %+1.2fj\n",
60 * ii, lv_creal(out[ii]), lv_cimag(out[ii]));
61 * }
62 *
63 * volk_free(in);
64 * volk_free(out);
65 * \endcode
66 */
67
68 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
69 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
70
71
72 #include <math.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <volk/volk_complex.h>
76 #define ROTATOR_RELOAD 512
77 #define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78 #define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
79
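ROTATOR_RELOAD is the number of samples processed between renormalizations of the recursively updated phase; ROTATOR_RELOAD_2 and ROTATOR_RELOAD_4 are the matching inner-loop trip counts for kernels that consume 2 or 4 complex samples per iteration. A compile-time sketch of that invariant (C11 static_assert, assuming the macros above are in scope):

    #include <assert.h>
    /* Each W-wide kernel runs ROTATOR_RELOAD / W inner iterations between
     * renormalizations, so every variant covers exactly ROTATOR_RELOAD
     * samples per renormalization. */
    static_assert(2 * ROTATOR_RELOAD_2 == ROTATOR_RELOAD, "2-wide cadence");
    static_assert(4 * ROTATOR_RELOAD_4 == ROTATOR_RELOAD, "4-wide cadence");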
80
81 #ifdef LV_HAVE_GENERIC
82
83 6 static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
84 const lv_32fc_t* inVector,
85 const lv_32fc_t phase_inc,
86 lv_32fc_t* phase,
87 unsigned int num_points)
88 {
89 6 unsigned int i = 0;
90 6 int j = 0;
91
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 6 times.
516 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
92
2/2
✓ Branch 0 taken 261120 times.
✓ Branch 1 taken 510 times.
261630 for (j = 0; j < ROTATOR_RELOAD; ++j) {
93 261120 *outVector++ = *inVector++ * (*phase);
94 261120 (*phase) *= phase_inc;
95 }
96
97 510 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
98 }
99
2/2
✓ Branch 0 taken 1034 times.
✓ Branch 1 taken 6 times.
1040 for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
100 1034 *outVector++ = *inVector++ * (*phase);
101 1034 (*phase) *= phase_inc;
102 }
103
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (i) {
104 // Make sure we normalize the phase on every call!
105 6 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
106 }
107 6 }
108
109 #endif /* LV_HAVE_GENERIC */
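The hypotf renormalization above is what keeps the recursion stable: each complex multiply perturbs |*phase| by roughly one ulp, and without correction the error compounds over long runs. A minimal, self-contained demonstration (standard C99 complex arithmetic, no VOLK dependencies; the renormalization line mirrors the generic kernel):

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const float complex inc = cosf(0.1f) + sinf(0.1f) * I; /* unit-magnitude increment */
        float complex phase = 1.0f;
        for (int n = 0; n < (1 << 20); ++n) {
            phase *= inc; /* rounding error accumulates in |phase| */
        }
        printf("|phase| before renormalization: %.9f\n", (double)cabsf(phase));
        phase /= hypotf(crealf(phase), cimagf(phase)); /* the kernel's fix */
        printf("|phase| after  renormalization: %.9f\n", (double)cabsf(phase));
        return 0;
    }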
110
111
112 #ifdef LV_HAVE_NEON
113 #include <arm_neon.h>
114 #include <volk/volk_neon_intrinsics.h>
115
116 static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
117 const lv_32fc_t* inVector,
118 const lv_32fc_t phase_inc,
119 lv_32fc_t* phase,
120 unsigned int num_points)
121
122 {
123 lv_32fc_t* outputVectorPtr = outVector;
124 const lv_32fc_t* inputVectorPtr = inVector;
125 lv_32fc_t incr = 1;
126 lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
127 float32x4x2_t input_vec;
128 float32x4x2_t output_vec;
129
130 unsigned int i = 0, j = 0;
131 // const unsigned int quarter_points = num_points / 4;
132
133 for (i = 0; i < 4; ++i) {
134 phasePtr[i] *= incr;
135 incr *= (phase_inc);
136 }
137
138 // Notice that incr has been incremented in the previous loop
139 const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
140 const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
141 float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
142
143 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
144 for (j = 0; j < ROTATOR_RELOAD_4; j++) {
145 input_vec = vld2q_f32((float*)inputVectorPtr);
146 // Prefetch next one, speeds things up
147 __VOLK_PREFETCH(inputVectorPtr + 4);
148 // Rotate
149 output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
150 // Increase phase
151 phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
152 // Store output
153 vst2q_f32((float*)outputVectorPtr, output_vec);
154
155 outputVectorPtr += 4;
156 inputVectorPtr += 4;
157 }
158 // normalize phase so magnitude doesn't grow because of
159 // floating point rounding error
160 const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
161 const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
162 // Multiply complex with real
163 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
164 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
165 }
166
167 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
168 input_vec = vld2q_f32((float*)inputVectorPtr);
169 // Prefetch next one, speeds things up
170 __VOLK_PREFETCH(inputVectorPtr + 4);
171 // Rotate
172 output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
173 // Increase phase
174 phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
175 // Store output
176 vst2q_f32((float*)outputVectorPtr, output_vec);
177
178 outputVectorPtr += 4;
179 inputVectorPtr += 4;
180 }
181 // i != 0 means the loop above ran at least once
182 if (i) {
183 // normalize phase so magnitude doesn't grow because of
184 // floating point rounding error
185 const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
186 const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
187 // Multiply complex with real
188 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
189 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
190 }
191 // Store current phase
192 vst2q_f32((float*)phasePtr, phase_vec);
193
194 // Deal with the rest
195 for (i = 0; i < num_points % 4; i++) {
196 *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
197 phasePtr[0] *= (phase_inc);
198 }
199
200 // Save the phase so the next call continues with a continuous phase
201 (*phase) = phasePtr[0];
202 }
203
204 #endif /* LV_HAVE_NEON */
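The NEON path keeps samples deinterleaved: vld2q_f32 splits four complex values into val[0] (real parts) and val[1] (imaginary parts), and _vmultiply_complexq_f32 multiplies in that split form. A scalar model of what that helper computes per lane (a sketch; the arrays stand in for the two float32x4_t halves):

    /* Scalar model of the split-format complex multiply performed by
     * _vmultiply_complexq_f32 (a sketch). re/im arrays model the two halves
     * of a float32x4x2_t: val[0] holds real parts, val[1] imaginary parts.
     * Computes z = a * b per lane. */
    static void complexmul_split4(float zr[4], float zi[4],
                                  const float ar[4], const float ai[4],
                                  const float br[4], const float bi[4])
    {
        for (int k = 0; k < 4; ++k) {
            zr[k] = ar[k] * br[k] - ai[k] * bi[k]; /* real: ac - bd */
            zi[k] = ar[k] * bi[k] + ai[k] * br[k]; /* imag: ad + bc */
        }
    }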
205
206
207 #ifdef LV_HAVE_SSE4_1
208 #include <smmintrin.h>
209
210 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
211 const lv_32fc_t* inVector,
212 const lv_32fc_t phase_inc,
213 lv_32fc_t* phase,
214 unsigned int num_points)
215 {
216 2 lv_32fc_t* cPtr = outVector;
217 2 const lv_32fc_t* aPtr = inVector;
218 2 lv_32fc_t incr = 1;
219 2 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
220
221 2 unsigned int i, j = 0;
222
223
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.
6 for (i = 0; i < 2; ++i) {
224 4 phase_Ptr[i] *= incr;
225 4 incr *= (phase_inc);
226 }
227
228 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
229
230 2 phase_Val = _mm_loadu_ps((float*)phase_Ptr);
231 2 inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
232
233
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
234
2/2
✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 510 times.
131070 for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
235
236 130560 aVal = _mm_load_ps((float*)aPtr);
237
238 130560 yl = _mm_moveldup_ps(phase_Val);
239 130560 yh = _mm_movehdup_ps(phase_Val);
240 130560 ylp = _mm_moveldup_ps(inc_Val);
241 130560 yhp = _mm_movehdup_ps(inc_Val);
242
243 130560 tmp1 = _mm_mul_ps(aVal, yl);
244 130560 tmp1p = _mm_mul_ps(phase_Val, ylp);
245
246 130560 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
247 130560 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
248 130560 tmp2 = _mm_mul_ps(aVal, yh);
249 130560 tmp2p = _mm_mul_ps(phase_Val, yhp);
250
251 130560 z = _mm_addsub_ps(tmp1, tmp2);
252 130560 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
253
254 _mm_store_ps((float*)cPtr, z);
255
256 130560 aPtr += 2;
257 130560 cPtr += 2;
258 }
259 510 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
260 510 tmp2 = _mm_hadd_ps(tmp1, tmp1);
261 510 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
262 510 tmp2 = _mm_sqrt_ps(tmp1);
263 510 phase_Val = _mm_div_ps(phase_Val, tmp2);
264 }
265
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
266 510 aVal = _mm_load_ps((float*)aPtr);
267
268 510 yl = _mm_moveldup_ps(phase_Val);
269 510 yh = _mm_movehdup_ps(phase_Val);
270 510 ylp = _mm_moveldup_ps(inc_Val);
271 510 yhp = _mm_movehdup_ps(inc_Val);
272
273 510 tmp1 = _mm_mul_ps(aVal, yl);
274
275 510 tmp1p = _mm_mul_ps(phase_Val, ylp);
276
277 510 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
278 510 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
279 510 tmp2 = _mm_mul_ps(aVal, yh);
280 510 tmp2p = _mm_mul_ps(phase_Val, yhp);
281
282 510 z = _mm_addsub_ps(tmp1, tmp2);
283 510 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
284
285 _mm_store_ps((float*)cPtr, z);
286
287 510 aPtr += 2;
288 510 cPtr += 2;
289 }
290
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
291 2 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
292 2 tmp2 = _mm_hadd_ps(tmp1, tmp1);
293 2 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
294 2 tmp2 = _mm_sqrt_ps(tmp1);
295 2 phase_Val = _mm_div_ps(phase_Val, tmp2);
296 }
297
298 _mm_storeu_ps((float*)phase_Ptr, phase_Val);
299
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points & 1) {
300 2 *cPtr++ = *aPtr++ * phase_Ptr[0];
301 2 phase_Ptr[0] *= (phase_inc);
302 }
303
304 2 (*phase) = phase_Ptr[0];
305 2 }
306
307 #endif /* LV_HAVE_SSE4_1 for aligned */
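The SSE kernel works on interleaved (re, im) pairs instead: _mm_moveldup_ps duplicates the real parts of the phase, _mm_movehdup_ps duplicates the imaginary parts, the 0xB1 shuffle swaps each (re, im) pair of the input, and _mm_addsub_ps subtracts in even lanes and adds in odd lanes, which together form the complex product. A scalar model of one pair (a sketch under those lane conventions):

    /* Scalar model of one interleaved SSE complex multiply (a sketch).
     * x = (a, b) is one input sample a + bi, p = (c, d) is the phase c + di. */
    static void complexmul_interleaved(float z[2], const float x[2], const float p[2])
    {
        const float yl = p[0];       /* moveldup: duplicated real part c */
        const float yh = p[1];       /* movehdup: duplicated imag part d */
        const float t1r = x[0] * yl; /* a*c */
        const float t1i = x[1] * yl; /* b*c */
        const float xs_r = x[1];     /* shuffle 0xB1 swaps the pair: (b, a) */
        const float xs_i = x[0];
        const float t2r = xs_r * yh; /* b*d */
        const float t2i = xs_i * yh; /* a*d */
        z[0] = t1r - t2r;            /* addsub even lane: a*c - b*d */
        z[1] = t1i + t2i;            /* addsub odd lane:  b*c + a*d */
    }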
308
309
310 #ifdef LV_HAVE_SSE4_1
311 #include <smmintrin.h>
312
313 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
314 const lv_32fc_t* inVector,
315 const lv_32fc_t phase_inc,
316 lv_32fc_t* phase,
317 unsigned int num_points)
318 {
319 2 lv_32fc_t* cPtr = outVector;
320 2 const lv_32fc_t* aPtr = inVector;
321 2 lv_32fc_t incr = 1;
322 2 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
323
324 2 unsigned int i, j = 0;
325
326
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.
6 for (i = 0; i < 2; ++i) {
327 4 phase_Ptr[i] *= incr;
328 4 incr *= (phase_inc);
329 }
330
331 /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
332 printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
333 printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
334 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
335
336 2 phase_Val = _mm_loadu_ps((float*)phase_Ptr);
337 2 inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
338
339
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
340
2/2
✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 510 times.
131070 for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
341
342 130560 aVal = _mm_loadu_ps((float*)aPtr);
343
344 130560 yl = _mm_moveldup_ps(phase_Val);
345 130560 yh = _mm_movehdup_ps(phase_Val);
346 130560 ylp = _mm_moveldup_ps(inc_Val);
347 130560 yhp = _mm_movehdup_ps(inc_Val);
348
349 130560 tmp1 = _mm_mul_ps(aVal, yl);
350 130560 tmp1p = _mm_mul_ps(phase_Val, ylp);
351
352 130560 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
353 130560 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
354 130560 tmp2 = _mm_mul_ps(aVal, yh);
355 130560 tmp2p = _mm_mul_ps(phase_Val, yhp);
356
357 130560 z = _mm_addsub_ps(tmp1, tmp2);
358 130560 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
359
360 _mm_storeu_ps((float*)cPtr, z);
361
362 130560 aPtr += 2;
363 130560 cPtr += 2;
364 }
365 510 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
366 510 tmp2 = _mm_hadd_ps(tmp1, tmp1);
367 510 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
368 510 tmp2 = _mm_sqrt_ps(tmp1);
369 510 phase_Val = _mm_div_ps(phase_Val, tmp2);
370 }
371
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
372 510 aVal = _mm_loadu_ps((float*)aPtr);
373
374 510 yl = _mm_moveldup_ps(phase_Val);
375 510 yh = _mm_movehdup_ps(phase_Val);
376 510 ylp = _mm_moveldup_ps(inc_Val);
377 510 yhp = _mm_movehdup_ps(inc_Val);
378
379 510 tmp1 = _mm_mul_ps(aVal, yl);
380
381 510 tmp1p = _mm_mul_ps(phase_Val, ylp);
382
383 510 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
384 510 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
385 510 tmp2 = _mm_mul_ps(aVal, yh);
386 510 tmp2p = _mm_mul_ps(phase_Val, yhp);
387
388 510 z = _mm_addsub_ps(tmp1, tmp2);
389 510 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
390
391 _mm_storeu_ps((float*)cPtr, z);
392
393 510 aPtr += 2;
394 510 cPtr += 2;
395 }
396
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
397 2 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
398 2 tmp2 = _mm_hadd_ps(tmp1, tmp1);
399 2 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
400 2 tmp2 = _mm_sqrt_ps(tmp1);
401 2 phase_Val = _mm_div_ps(phase_Val, tmp2);
402 }
403
404 _mm_storeu_ps((float*)phase_Ptr, phase_Val);
405
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_points & 1) {
406 2 *cPtr++ = *aPtr++ * phase_Ptr[0];
407 2 phase_Ptr[0] *= (phase_inc);
408 }
409
410 2 (*phase) = phase_Ptr[0];
411 2 }
412
413 #endif /* LV_HAVE_SSE4_1 */
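The five-instruction renormalization block shared by both SSE kernels (_mm_mul_ps, _mm_hadd_ps, the 0xD8 shuffle, _mm_sqrt_ps, _mm_div_ps) computes |phase| for each lane pair and divides it out. A scalar model for one phasor (a sketch):

    #include <math.h>
    /* Scalar model of the SSE renormalization for one phasor p = (c, d):
     * squaring and hadd produce c*c + d*d, the 0xD8 shuffle broadcasts that
     * sum to both lanes of the pair, then sqrt and div restore |p| == 1. */
    static void normalize_phasor(float p[2])
    {
        const float mag = sqrtf(p[0] * p[0] + p[1] * p[1]);
        p[0] /= mag;
        p[1] /= mag;
    }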
414
415
416 #ifdef LV_HAVE_AVX
417 #include <immintrin.h>
418 #include <volk/volk_avx_intrinsics.h>
419
420 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
421 const lv_32fc_t* inVector,
422 const lv_32fc_t phase_inc,
423 lv_32fc_t* phase,
424 unsigned int num_points)
425 {
426 2 lv_32fc_t* cPtr = outVector;
427 2 const lv_32fc_t* aPtr = inVector;
428 2 lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
429 2 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
430
431 2 unsigned int i, j = 0;
432
433
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (i = 0; i < 4; ++i) {
434 8 phase_Ptr[i] *= incr;
435 8 incr *= (phase_inc);
436 }
437
438 __m256 aVal, phase_Val, z;
439
440 2 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
441
442 2 const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
443 lv_creal(incr),
444 lv_cimag(incr),
445 lv_creal(incr),
446 lv_cimag(incr),
447 lv_creal(incr),
448 lv_cimag(incr),
449 lv_creal(incr));
450
451
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
452
2/2
✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.
65790 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
453
454 65280 aVal = _mm256_load_ps((float*)aPtr);
455
456 65280 z = _mm256_complexmul_ps(aVal, phase_Val);
457 65280 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
458
459 _mm256_store_ps((float*)cPtr, z);
460
461 65280 aPtr += 4;
462 65280 cPtr += 4;
463 }
464 510 phase_Val = _mm256_normalize_ps(phase_Val);
465 }
466
467
2/2
✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.
256 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
468 254 aVal = _mm256_load_ps((float*)aPtr);
469
470 254 z = _mm256_complexmul_ps(aVal, phase_Val);
471 254 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
472
473 _mm256_store_ps((float*)cPtr, z);
474
475 254 aPtr += 4;
476 254 cPtr += 4;
477 }
478
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
479 2 phase_Val = _mm256_normalize_ps(phase_Val);
480 }
481
482 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
483 2 (*phase) = phase_Ptr[0];
484 2 volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
485 2 }
486
487 #endif /* LV_HAVE_AVX for aligned */
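Unlike the SSE and FMA kernels, which finish the last odd samples with inline scalar multiplies, both AVX kernels hand the final num_points % 4 samples to the generic kernel, so the phase carried in *phase stays continuous across the hand-off. A skeleton of that pattern (a sketch; rotate_scalar is a hypothetical stand-in for volk_32fc_s32fc_x2_rotator_32fc_generic):

    #include <complex.h>

    /* Hypothetical stand-in for volk_32fc_s32fc_x2_rotator_32fc_generic. */
    static void rotate_scalar(float complex* out, const float complex* in,
                              float complex inc, float complex* phase,
                              unsigned int n)
    {
        for (unsigned int k = 0; k < n; ++k) {
            *out++ = *in++ * (*phase);
            (*phase) *= inc;
        }
    }

    static void rotate(float complex* out, const float complex* in,
                       float complex inc, float complex* phase, unsigned int n)
    {
        const unsigned int simd_n = n - (n % 4);
        /* ... SIMD loop over simd_n samples, updating *phase as it goes ... */
        rotate_scalar(out + simd_n, in + simd_n, inc, phase, n % 4);
    }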
488
489
490 #ifdef LV_HAVE_AVX
491 #include <immintrin.h>
492 #include <volk/volk_avx_intrinsics.h>
493
494 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
495 const lv_32fc_t* inVector,
496 const lv_32fc_t phase_inc,
497 lv_32fc_t* phase,
498 unsigned int num_points)
499 {
500 2 lv_32fc_t* cPtr = outVector;
501 2 const lv_32fc_t* aPtr = inVector;
502 2 lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
503 2 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
504
505 2 unsigned int i, j = 0;
506
507
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (i = 0; i < 4; ++i) {
508 8 phase_Ptr[i] *= incr;
509 8 incr *= (phase_inc);
510 }
511
512 __m256 aVal, phase_Val, z;
513
514 2 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
515
516 2 const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
517 lv_creal(incr),
518 lv_cimag(incr),
519 lv_creal(incr),
520 lv_cimag(incr),
521 lv_creal(incr),
522 lv_cimag(incr),
523 lv_creal(incr));
524
525
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
526
2/2
✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.
65790 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
527
528 65280 aVal = _mm256_loadu_ps((float*)aPtr);
529
530 65280 z = _mm256_complexmul_ps(aVal, phase_Val);
531 65280 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
532
533 _mm256_storeu_ps((float*)cPtr, z);
534
535 65280 aPtr += 4;
536 65280 cPtr += 4;
537 }
538 510 phase_Val = _mm256_normalize_ps(phase_Val);
539 }
540
541
2/2
✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.
256 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
542 254 aVal = _mm256_loadu_ps((float*)aPtr);
543
544 254 z = _mm256_complexmul_ps(aVal, phase_Val);
545 254 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
546
547 _mm256_storeu_ps((float*)cPtr, z);
548
549 254 aPtr += 4;
550 254 cPtr += 4;
551 }
552
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
553 2 phase_Val = _mm256_normalize_ps(phase_Val);
554 }
555
556 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
557 2 (*phase) = phase_Ptr[0];
558 2 volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
559 2 }
560
561 #endif /* LV_HAVE_AVX */
562
563 #if LV_HAVE_AVX && LV_HAVE_FMA
564 #include <immintrin.h>
565
566 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
567 const lv_32fc_t* inVector,
568 const lv_32fc_t phase_inc,
569 lv_32fc_t* phase,
570 unsigned int num_points)
571 {
572 2 lv_32fc_t* cPtr = outVector;
573 2 const lv_32fc_t* aPtr = inVector;
574 2 lv_32fc_t incr = 1;
575 __VOLK_ATTR_ALIGNED(32)
576 2 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
577
578 2 unsigned int i, j = 0;
579
580
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (i = 0; i < 4; ++i) {
581 8 phase_Ptr[i] *= incr;
582 8 incr *= (phase_inc);
583 }
584
585 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
586
587 2 phase_Val = _mm256_load_ps((float*)phase_Ptr);
588 2 inc_Val = _mm256_set_ps(lv_cimag(incr),
589 lv_creal(incr),
590 lv_cimag(incr),
591 lv_creal(incr),
592 lv_cimag(incr),
593 lv_creal(incr),
594 lv_cimag(incr),
595 lv_creal(incr));
596
597
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
598
2/2
✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.
65790 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
599
600 65280 aVal = _mm256_load_ps((float*)aPtr);
601
602 65280 yl = _mm256_moveldup_ps(phase_Val);
603 65280 yh = _mm256_movehdup_ps(phase_Val);
604 65280 ylp = _mm256_moveldup_ps(inc_Val);
605 65280 yhp = _mm256_movehdup_ps(inc_Val);
606
607 65280 tmp1 = aVal;
608 65280 tmp1p = phase_Val;
609
610 65280 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
611 65280 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
612 65280 tmp2 = _mm256_mul_ps(aVal, yh);
613 65280 tmp2p = _mm256_mul_ps(phase_Val, yhp);
614
615 65280 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
616 65280 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
617
618 _mm256_store_ps((float*)cPtr, z);
619
620 65280 aPtr += 4;
621 65280 cPtr += 4;
622 }
623 510 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
624 510 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
625 510 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
626 510 tmp2 = _mm256_sqrt_ps(tmp1);
627 510 phase_Val = _mm256_div_ps(phase_Val, tmp2);
628 }
629
2/2
✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.
256 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
630 254 aVal = _mm256_load_ps((float*)aPtr);
631
632 254 yl = _mm256_moveldup_ps(phase_Val);
633 254 yh = _mm256_movehdup_ps(phase_Val);
634 254 ylp = _mm256_moveldup_ps(inc_Val);
635 254 yhp = _mm256_movehdup_ps(inc_Val);
636
637 254 tmp1 = aVal;
638 254 tmp1p = phase_Val;
639
640 254 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
641 254 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
642 254 tmp2 = _mm256_mul_ps(aVal, yh);
643 254 tmp2p = _mm256_mul_ps(phase_Val, yhp);
644
645 254 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
646 254 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
647
648 _mm256_store_ps((float*)cPtr, z);
649
650 254 aPtr += 4;
651 254 cPtr += 4;
652 }
653
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
654 2 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
655 2 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
656 2 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
657 2 tmp2 = _mm256_sqrt_ps(tmp1);
658 2 phase_Val = _mm256_div_ps(phase_Val, tmp2);
659 }
660
661 _mm256_store_ps((float*)phase_Ptr, phase_Val);
662
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = 0; i < num_points % 4; ++i) {
663 6 *cPtr++ = *aPtr++ * phase_Ptr[0];
664 6 phase_Ptr[0] *= (phase_inc);
665 }
666
667 2 (*phase) = phase_Ptr[0];
668 2 }
669
670 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
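The FMA variants drop the separate multiply of the real-part products: _mm256_fmaddsub_ps(tmp1, yl, tmp2) fuses the multiply by the duplicated real parts with the alternating subtract/add of the cross terms. A scalar model for one complex pair (a sketch; fmaf plays the role of the fused lanes):

    #include <math.h>
    /* Scalar model of the fused complex multiply z = x * p, with
     * x = (a, b) and p = (c, d) stored as interleaved (re, im). */
    static void complexmul_fma(float z[2], const float x[2], const float p[2])
    {
        const float cross_r = x[1] * p[1]; /* b*d, the even-lane cross term */
        const float cross_i = x[0] * p[1]; /* a*d, the odd-lane cross term  */
        z[0] = fmaf(x[0], p[0], -cross_r); /* even lane: a*c - b*d */
        z[1] = fmaf(x[1], p[0], cross_i);  /* odd lane:  b*c + a*d */
    }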
671
672 #if LV_HAVE_AVX && LV_HAVE_FMA
673 #include <immintrin.h>
674
675 2 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
676 const lv_32fc_t* inVector,
677 const lv_32fc_t phase_inc,
678 lv_32fc_t* phase,
679 unsigned int num_points)
680 {
681 2 lv_32fc_t* cPtr = outVector;
682 2 const lv_32fc_t* aPtr = inVector;
683 2 lv_32fc_t incr = 1;
684 2 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
685
686 2 unsigned int i, j = 0;
687
688
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
10 for (i = 0; i < 4; ++i) {
689 8 phase_Ptr[i] *= incr;
690 8 incr *= (phase_inc);
691 }
692
693 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
694
695 2 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
696 2 inc_Val = _mm256_set_ps(lv_cimag(incr),
697 lv_creal(incr),
698 lv_cimag(incr),
699 lv_creal(incr),
700 lv_cimag(incr),
701 lv_creal(incr),
702 lv_cimag(incr),
703 lv_creal(incr));
704
705
2/2
✓ Branch 0 taken 510 times.
✓ Branch 1 taken 2 times.
512 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
706
2/2
✓ Branch 0 taken 65280 times.
✓ Branch 1 taken 510 times.
65790 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
707
708 65280 aVal = _mm256_loadu_ps((float*)aPtr);
709
710 65280 yl = _mm256_moveldup_ps(phase_Val);
711 65280 yh = _mm256_movehdup_ps(phase_Val);
712 65280 ylp = _mm256_moveldup_ps(inc_Val);
713 65280 yhp = _mm256_movehdup_ps(inc_Val);
714
715 65280 tmp1 = aVal;
716 65280 tmp1p = phase_Val;
717
718 65280 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
719 65280 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
720 65280 tmp2 = _mm256_mul_ps(aVal, yh);
721 65280 tmp2p = _mm256_mul_ps(phase_Val, yhp);
722
723 65280 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
724 65280 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
725
726 _mm256_storeu_ps((float*)cPtr, z);
727
728 65280 aPtr += 4;
729 65280 cPtr += 4;
730 }
731 510 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
732 510 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
733 510 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
734 510 tmp2 = _mm256_sqrt_ps(tmp1);
735 510 phase_Val = _mm256_div_ps(phase_Val, tmp2);
736 }
737
2/2
✓ Branch 0 taken 254 times.
✓ Branch 1 taken 2 times.
256 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
738 254 aVal = _mm256_loadu_ps((float*)aPtr);
739
740 254 yl = _mm256_moveldup_ps(phase_Val);
741 254 yh = _mm256_movehdup_ps(phase_Val);
742 254 ylp = _mm256_moveldup_ps(inc_Val);
743 254 yhp = _mm256_movehdup_ps(inc_Val);
744
745 254 tmp1 = aVal;
746 254 tmp1p = phase_Val;
747
748 254 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
749 254 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
750 254 tmp2 = _mm256_mul_ps(aVal, yh);
751 254 tmp2p = _mm256_mul_ps(phase_Val, yhp);
752
753 254 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
754 254 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
755
756 _mm256_storeu_ps((float*)cPtr, z);
757
758 254 aPtr += 4;
759 254 cPtr += 4;
760 }
761
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (i) {
762 2 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
763 2 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
764 2 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
765 2 tmp2 = _mm256_sqrt_ps(tmp1);
766 2 phase_Val = _mm256_div_ps(phase_Val, tmp2);
767 }
768
769 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
770
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = 0; i < num_points % 4; ++i) {
771 6 *cPtr++ = *aPtr++ * phase_Ptr[0];
772 6 phase_Ptr[0] *= (phase_inc);
773 }
774
775 2 (*phase) = phase_Ptr[0];
776 2 }
777
778 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
779
780 #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
781