GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_divide_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 115 115 100.0%
Functions: 5 5 100.0%
Branches: 16 16 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_divide_32fc
12 *
13 * \b Overview
14 *
15 * Divide first vector of complexes element-wise by second.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
20 * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
21 *
22 * \b Inputs
23 * \li numeratorVector: The numerator complex values.
24 * \li numeratorVector: The denumerator complex values.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li outputVector: The output vector complex floats.
29 *
30 * \b Example
31 * divide a complex vector by itself, demonstrating the result should be pretty close to
32 * 1+0j.
33 *
34 * \code
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
39 *
40 * float delta = 2.f*M_PI / (float)N;
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * float real_1 = std::cos(0.3f * (float)ii);
43 * float imag_1 = std::sin(0.3f * (float)ii);
44 * input_vector[ii] = lv_cmake(real_1, imag_1);
45 * }
46 *
47 * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
48 *
49 * for(unsigned int ii = 0; ii < N; ++ii){
50 * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
51 * }
52 * printf("\n");
53 *
54 * volk_free(input_vector);
55 * volk_free(out);
56 * \endcode
57 */
58
59 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
61
62 #include <float.h>
63 #include <inttypes.h>
64 #include <volk/volk_complex.h>
65
66
67 #ifdef LV_HAVE_GENERIC
68
69 4 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
70 const lv_32fc_t* aVector,
71 const lv_32fc_t* bVector,
72 unsigned int num_points)
73 {
74 4 lv_32fc_t* cPtr = cVector;
75 4 const lv_32fc_t* aPtr = aVector;
76 4 const lv_32fc_t* bPtr = bVector;
77
78
2/2
✓ Branch 0 taken 262156 times.
✓ Branch 1 taken 4 times.
262160 for (unsigned int number = 0; number < num_points; number++) {
79 262156 *cPtr++ = (*aPtr++) / (*bPtr++);
80 }
81 4 }
82 #endif /* LV_HAVE_GENERIC */
83
84
85 #ifdef LV_HAVE_SSE3
86 #include <pmmintrin.h>
87 #include <volk/volk_sse3_intrinsics.h>
88
89 2 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
90 const lv_32fc_t* numeratorVector,
91 const lv_32fc_t* denumeratorVector,
92 unsigned int num_points)
93 {
94 /*
95 * we'll do the "classical"
96 * a a b*
97 * --- = -------
98 * b |b|^2
99 * */
100 2 unsigned int number = 0;
101 2 const unsigned int quarterPoints = num_points / 4;
102
103 __m128 num01, num23, den01, den23, norm, result;
104 2 lv_32fc_t* c = cVector;
105 2 const lv_32fc_t* a = numeratorVector;
106 2 const lv_32fc_t* b = denumeratorVector;
107
108
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
109 65534 num01 = _mm_loadu_ps((float*)a); // first pair
110 65534 den01 = _mm_loadu_ps((float*)b); // first pair
111 65534 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
112 65534 a += 2;
113 65534 b += 2;
114
115 65534 num23 = _mm_loadu_ps((float*)a); // second pair
116 65534 den23 = _mm_loadu_ps((float*)b); // second pair
117 65534 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
118 65534 a += 2;
119 65534 b += 2;
120
121 65534 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
122 65534 den01 = _mm_unpacklo_ps(norm, norm);
123 65534 den23 = _mm_unpackhi_ps(norm, norm);
124
125 65534 result = _mm_div_ps(num01, den01);
126 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
127 65534 c += 2;
128 65534 result = _mm_div_ps(num23, den23);
129 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
130 65534 c += 2;
131 }
132
133 2 number *= 4;
134
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
135 6 *c = (*a) / (*b);
136 6 a++;
137 6 b++;
138 6 c++;
139 }
140 2 }
141 #endif /* LV_HAVE_SSE3 */
142
143
144 #ifdef LV_HAVE_AVX
145 #include <immintrin.h>
146 #include <volk/volk_avx_intrinsics.h>
147
148 2 static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
149 const lv_32fc_t* numeratorVector,
150 const lv_32fc_t* denumeratorVector,
151 unsigned int num_points)
152 {
153 /*
154 * we'll do the "classical"
155 * a a b*
156 * --- = -------
157 * b |b|^2
158 * */
159 2 unsigned int number = 0;
160 2 const unsigned int quarterPoints = num_points / 4;
161
162 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
163 2 lv_32fc_t* c = cVector;
164 2 const lv_32fc_t* a = numeratorVector;
165 2 const lv_32fc_t* b = denumeratorVector;
166
167
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
168 65534 num = _mm256_loadu_ps(
169 (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
170 65534 denum = _mm256_loadu_ps(
171 (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
172 65534 mul_conj = _mm256_complexconjugatemul_ps(num, denum);
173 65534 sq = _mm256_mul_ps(denum, denum); // Square the values
174 65534 mag_sq_un = _mm256_hadd_ps(
175 sq, sq); // obtain the actual squared magnitude, although out of order
176 65534 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
177 // best guide I found on using these functions:
178 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
179 65534 div = _mm256_div_ps(mul_conj, mag_sq);
180
181 _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
182
183 65534 a += 4;
184 65534 b += 4;
185 65534 c += 4;
186 }
187
188 2 number = quarterPoints * 4;
189
190
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
191 6 *c++ = (*a++) / (*b++);
192 }
193 2 }
194 #endif /* LV_HAVE_AVX */
195
196
197 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
198
199
200 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
202
203 #include <float.h>
204 #include <inttypes.h>
205 #include <stdio.h>
206 #include <volk/volk_complex.h>
207
208 #ifdef LV_HAVE_SSE3
209 #include <pmmintrin.h>
210 #include <volk/volk_sse3_intrinsics.h>
211
212 2 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
213 const lv_32fc_t* numeratorVector,
214 const lv_32fc_t* denumeratorVector,
215 unsigned int num_points)
216 {
217 /*
218 * we'll do the "classical"
219 * a a b*
220 * --- = -------
221 * b |b|^2
222 * */
223 2 unsigned int number = 0;
224 2 const unsigned int quarterPoints = num_points / 4;
225
226 __m128 num01, num23, den01, den23, norm, result;
227 2 lv_32fc_t* c = cVector;
228 2 const lv_32fc_t* a = numeratorVector;
229 2 const lv_32fc_t* b = denumeratorVector;
230
231
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
232 65534 num01 = _mm_load_ps((float*)a); // first pair
233 65534 den01 = _mm_load_ps((float*)b); // first pair
234 65534 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
235 65534 a += 2;
236 65534 b += 2;
237
238 65534 num23 = _mm_load_ps((float*)a); // second pair
239 65534 den23 = _mm_load_ps((float*)b); // second pair
240 65534 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
241 65534 a += 2;
242 65534 b += 2;
243
244 65534 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
245
246 65534 den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
247 65534 den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
248
249 65534 result = _mm_div_ps(num01, den01);
250 _mm_store_ps((float*)c, result); // Store the results back into the C container
251 65534 c += 2;
252 65534 result = _mm_div_ps(num23, den23);
253 _mm_store_ps((float*)c, result); // Store the results back into the C container
254 65534 c += 2;
255 }
256
257 2 number *= 4;
258
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
259 6 *c = (*a) / (*b);
260 6 a++;
261 6 b++;
262 6 c++;
263 }
264 2 }
265 #endif /* LV_HAVE_SSE */
266
267 #ifdef LV_HAVE_AVX
268 #include <immintrin.h>
269 #include <volk/volk_avx_intrinsics.h>
270
271 2 static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
272 const lv_32fc_t* numeratorVector,
273 const lv_32fc_t* denumeratorVector,
274 unsigned int num_points)
275 {
276 /*
277 * Guide to AVX intrisics:
278 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
279 *
280 * we'll do the "classical"
281 * a a b*
282 * --- = -------
283 * b |b|^2
284 *
285 */
286 2 lv_32fc_t* c = cVector;
287 2 const lv_32fc_t* a = numeratorVector;
288 2 const lv_32fc_t* b = denumeratorVector;
289
290 2 const unsigned int eigthPoints = num_points / 8;
291
292 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
293
294
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned int number = 0; number < eigthPoints; number++) {
295 // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
296 32766 num01 = _mm256_load_ps((float*)a);
297 32766 denum01 = _mm256_load_ps((float*)b);
298
299 32766 num01 = _mm256_complexconjugatemul_ps(num01, denum01);
300 32766 a += 4;
301 32766 b += 4;
302
303 32766 num23 = _mm256_load_ps((float*)a);
304 32766 denum23 = _mm256_load_ps((float*)b);
305 32766 num23 = _mm256_complexconjugatemul_ps(num23, denum23);
306 32766 a += 4;
307 32766 b += 4;
308
309 65532 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
310 _mm256_mul_ps(denum23, denum23));
311
312 32766 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
313 32766 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
314
315 32766 result0 = _mm256_div_ps(num01, denum01);
316 32766 result1 = _mm256_div_ps(num23, denum23);
317
318 _mm256_store_ps((float*)c, result0);
319 32766 c += 4;
320 _mm256_store_ps((float*)c, result1);
321 32766 c += 4;
322 }
323
324 2 volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
325 2 }
326 #endif /* LV_HAVE_AVX */
327
328
329 #ifdef LV_HAVE_NEON
330 #include <arm_neon.h>
331
332 static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
333 const lv_32fc_t* aVector,
334 const lv_32fc_t* bVector,
335 unsigned int num_points)
336 {
337 lv_32fc_t* cPtr = cVector;
338 const lv_32fc_t* aPtr = aVector;
339 const lv_32fc_t* bPtr = bVector;
340
341 float32x4x2_t aVal, bVal, cVal;
342 float32x4_t bAbs, bAbsInv;
343
344 const unsigned int quarterPoints = num_points / 4;
345 unsigned int number = 0;
346 for (; number < quarterPoints; number++) {
347 aVal = vld2q_f32((const float*)(aPtr));
348 bVal = vld2q_f32((const float*)(bPtr));
349 aPtr += 4;
350 bPtr += 4;
351 __VOLK_PREFETCH(aPtr + 4);
352 __VOLK_PREFETCH(bPtr + 4);
353
354 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
355 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
356
357 bAbsInv = vrecpeq_f32(bAbs);
358 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
359 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
360
361 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
362 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
363 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
364
365 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
366 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
367 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
368
369 vst2q_f32((float*)(cPtr), cVal);
370 cPtr += 4;
371 }
372
373 for (number = quarterPoints * 4; number < num_points; number++) {
374 *cPtr++ = (*aPtr++) / (*bPtr++);
375 }
376 }
377 #endif /* LV_HAVE_NEON */
378
379
380 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
381