Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2016 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_divide_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Divide first vector of complexes element-wise by second. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, | ||
20 | * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li numeratorVector: The numerator complex values. | ||
24 | * \li numeratorVector: The denumerator complex values. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li outputVector: The output vector complex floats. | ||
29 | * | ||
30 | * \b Example | ||
31 | * divide a complex vector by itself, demonstrating the result should be pretty close to | ||
32 | * 1+0j. | ||
33 | * | ||
34 | * \code | ||
35 | * int N = 10; | ||
36 | * unsigned int alignment = volk_get_alignment(); | ||
37 | * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
38 | * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
39 | * | ||
40 | * float delta = 2.f*M_PI / (float)N; | ||
41 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
42 | * float real_1 = std::cos(0.3f * (float)ii); | ||
43 | * float imag_1 = std::sin(0.3f * (float)ii); | ||
44 | * input_vector[ii] = lv_cmake(real_1, imag_1); | ||
45 | * } | ||
46 | * | ||
47 | * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N); | ||
48 | * | ||
49 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
50 | * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii])); | ||
51 | * } | ||
52 | * printf("\n"); | ||
53 | * | ||
54 | * volk_free(input_vector); | ||
55 | * volk_free(out); | ||
56 | * \endcode | ||
57 | */ | ||
58 | |||
59 | #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H | ||
60 | #define INCLUDED_volk_32fc_x2_divide_32fc_u_H | ||
61 | |||
62 | #include <float.h> | ||
63 | #include <inttypes.h> | ||
64 | #include <volk/volk_complex.h> | ||
65 | |||
66 | |||
67 | #ifdef LV_HAVE_GENERIC | ||
68 | |||
69 | 4 | static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, | |
70 | const lv_32fc_t* aVector, | ||
71 | const lv_32fc_t* bVector, | ||
72 | unsigned int num_points) | ||
73 | { | ||
74 | 4 | lv_32fc_t* cPtr = cVector; | |
75 | 4 | const lv_32fc_t* aPtr = aVector; | |
76 | 4 | const lv_32fc_t* bPtr = bVector; | |
77 | |||
78 |
2/2✓ Branch 0 taken 262156 times.
✓ Branch 1 taken 4 times.
|
262160 | for (unsigned int number = 0; number < num_points; number++) { |
79 | 262156 | *cPtr++ = (*aPtr++) / (*bPtr++); | |
80 | } | ||
81 | 4 | } | |
82 | #endif /* LV_HAVE_GENERIC */ | ||
83 | |||
84 | |||
85 | #ifdef LV_HAVE_SSE3 | ||
86 | #include <pmmintrin.h> | ||
87 | #include <volk/volk_sse3_intrinsics.h> | ||
88 | |||
89 | 2 | static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, | |
90 | const lv_32fc_t* numeratorVector, | ||
91 | const lv_32fc_t* denumeratorVector, | ||
92 | unsigned int num_points) | ||
93 | { | ||
94 | /* | ||
95 | * we'll do the "classical" | ||
96 | * a a b* | ||
97 | * --- = ------- | ||
98 | * b |b|^2 | ||
99 | * */ | ||
100 | 2 | unsigned int number = 0; | |
101 | 2 | const unsigned int quarterPoints = num_points / 4; | |
102 | |||
103 | __m128 num01, num23, den01, den23, norm, result; | ||
104 | 2 | lv_32fc_t* c = cVector; | |
105 | 2 | const lv_32fc_t* a = numeratorVector; | |
106 | 2 | const lv_32fc_t* b = denumeratorVector; | |
107 | |||
108 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
109 | 65534 | num01 = _mm_loadu_ps((float*)a); // first pair | |
110 | 65534 | den01 = _mm_loadu_ps((float*)b); // first pair | |
111 | 65534 | num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) | |
112 | 65534 | a += 2; | |
113 | 65534 | b += 2; | |
114 | |||
115 | 65534 | num23 = _mm_loadu_ps((float*)a); // second pair | |
116 | 65534 | den23 = _mm_loadu_ps((float*)b); // second pair | |
117 | 65534 | num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) | |
118 | 65534 | a += 2; | |
119 | 65534 | b += 2; | |
120 | |||
121 | 65534 | norm = _mm_magnitudesquared_ps_sse3(den01, den23); | |
122 | 65534 | den01 = _mm_unpacklo_ps(norm, norm); | |
123 | 65534 | den23 = _mm_unpackhi_ps(norm, norm); | |
124 | |||
125 | 65534 | result = _mm_div_ps(num01, den01); | |
126 | _mm_storeu_ps((float*)c, result); // Store the results back into the C container | ||
127 | 65534 | c += 2; | |
128 | 65534 | result = _mm_div_ps(num23, den23); | |
129 | _mm_storeu_ps((float*)c, result); // Store the results back into the C container | ||
130 | 65534 | c += 2; | |
131 | } | ||
132 | |||
133 | 2 | number *= 4; | |
134 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
135 | 6 | *c = (*a) / (*b); | |
136 | 6 | a++; | |
137 | 6 | b++; | |
138 | 6 | c++; | |
139 | } | ||
140 | 2 | } | |
141 | #endif /* LV_HAVE_SSE3 */ | ||
142 | |||
143 | |||
144 | #ifdef LV_HAVE_AVX | ||
145 | #include <immintrin.h> | ||
146 | #include <volk/volk_avx_intrinsics.h> | ||
147 | |||
148 | 2 | static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, | |
149 | const lv_32fc_t* numeratorVector, | ||
150 | const lv_32fc_t* denumeratorVector, | ||
151 | unsigned int num_points) | ||
152 | { | ||
153 | /* | ||
154 | * we'll do the "classical" | ||
155 | * a a b* | ||
156 | * --- = ------- | ||
157 | * b |b|^2 | ||
158 | * */ | ||
159 | 2 | unsigned int number = 0; | |
160 | 2 | const unsigned int quarterPoints = num_points / 4; | |
161 | |||
162 | __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div; | ||
163 | 2 | lv_32fc_t* c = cVector; | |
164 | 2 | const lv_32fc_t* a = numeratorVector; | |
165 | 2 | const lv_32fc_t* b = denumeratorVector; | |
166 | |||
167 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
168 | 65534 | num = _mm256_loadu_ps( | |
169 | (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... | ||
170 | 65534 | denum = _mm256_loadu_ps( | |
171 | (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... | ||
172 | 65534 | mul_conj = _mm256_complexconjugatemul_ps(num, denum); | |
173 | 65534 | sq = _mm256_mul_ps(denum, denum); // Square the values | |
174 | 65534 | mag_sq_un = _mm256_hadd_ps( | |
175 | sq, sq); // obtain the actual squared magnitude, although out of order | ||
176 | 65534 | mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them | |
177 | // best guide I found on using these functions: | ||
178 | // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 | ||
179 | 65534 | div = _mm256_div_ps(mul_conj, mag_sq); | |
180 | |||
181 | _mm256_storeu_ps((float*)c, div); // Store the results back into the C container | ||
182 | |||
183 | 65534 | a += 4; | |
184 | 65534 | b += 4; | |
185 | 65534 | c += 4; | |
186 | } | ||
187 | |||
188 | 2 | number = quarterPoints * 4; | |
189 | |||
190 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
191 | 6 | *c++ = (*a++) / (*b++); | |
192 | } | ||
193 | 2 | } | |
194 | #endif /* LV_HAVE_AVX */ | ||
195 | |||
196 | |||
197 | #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */ | ||
198 | |||
199 | |||
200 | #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H | ||
201 | #define INCLUDED_volk_32fc_x2_divide_32fc_a_H | ||
202 | |||
203 | #include <float.h> | ||
204 | #include <inttypes.h> | ||
205 | #include <stdio.h> | ||
206 | #include <volk/volk_complex.h> | ||
207 | |||
208 | #ifdef LV_HAVE_SSE3 | ||
209 | #include <pmmintrin.h> | ||
210 | #include <volk/volk_sse3_intrinsics.h> | ||
211 | |||
212 | 2 | static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, | |
213 | const lv_32fc_t* numeratorVector, | ||
214 | const lv_32fc_t* denumeratorVector, | ||
215 | unsigned int num_points) | ||
216 | { | ||
217 | /* | ||
218 | * we'll do the "classical" | ||
219 | * a a b* | ||
220 | * --- = ------- | ||
221 | * b |b|^2 | ||
222 | * */ | ||
223 | 2 | unsigned int number = 0; | |
224 | 2 | const unsigned int quarterPoints = num_points / 4; | |
225 | |||
226 | __m128 num01, num23, den01, den23, norm, result; | ||
227 | 2 | lv_32fc_t* c = cVector; | |
228 | 2 | const lv_32fc_t* a = numeratorVector; | |
229 | 2 | const lv_32fc_t* b = denumeratorVector; | |
230 | |||
231 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
232 | 65534 | num01 = _mm_load_ps((float*)a); // first pair | |
233 | 65534 | den01 = _mm_load_ps((float*)b); // first pair | |
234 | 65534 | num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) | |
235 | 65534 | a += 2; | |
236 | 65534 | b += 2; | |
237 | |||
238 | 65534 | num23 = _mm_load_ps((float*)a); // second pair | |
239 | 65534 | den23 = _mm_load_ps((float*)b); // second pair | |
240 | 65534 | num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) | |
241 | 65534 | a += 2; | |
242 | 65534 | b += 2; | |
243 | |||
244 | 65534 | norm = _mm_magnitudesquared_ps_sse3(den01, den23); | |
245 | |||
246 | 65534 | den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice | |
247 | 65534 | den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice | |
248 | |||
249 | 65534 | result = _mm_div_ps(num01, den01); | |
250 | _mm_store_ps((float*)c, result); // Store the results back into the C container | ||
251 | 65534 | c += 2; | |
252 | 65534 | result = _mm_div_ps(num23, den23); | |
253 | _mm_store_ps((float*)c, result); // Store the results back into the C container | ||
254 | 65534 | c += 2; | |
255 | } | ||
256 | |||
257 | 2 | number *= 4; | |
258 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
259 | 6 | *c = (*a) / (*b); | |
260 | 6 | a++; | |
261 | 6 | b++; | |
262 | 6 | c++; | |
263 | } | ||
264 | 2 | } | |
265 | #endif /* LV_HAVE_SSE */ | ||
266 | |||
267 | #ifdef LV_HAVE_AVX | ||
268 | #include <immintrin.h> | ||
269 | #include <volk/volk_avx_intrinsics.h> | ||
270 | |||
271 | 2 | static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, | |
272 | const lv_32fc_t* numeratorVector, | ||
273 | const lv_32fc_t* denumeratorVector, | ||
274 | unsigned int num_points) | ||
275 | { | ||
276 | /* | ||
277 | * Guide to AVX intrisics: | ||
278 | * https://software.intel.com/sites/landingpage/IntrinsicsGuide/# | ||
279 | * | ||
280 | * we'll do the "classical" | ||
281 | * a a b* | ||
282 | * --- = ------- | ||
283 | * b |b|^2 | ||
284 | * | ||
285 | */ | ||
286 | 2 | lv_32fc_t* c = cVector; | |
287 | 2 | const lv_32fc_t* a = numeratorVector; | |
288 | 2 | const lv_32fc_t* b = denumeratorVector; | |
289 | |||
290 | 2 | const unsigned int eigthPoints = num_points / 8; | |
291 | |||
292 | __m256 num01, num23, denum01, denum23, complex_result, result0, result1; | ||
293 | |||
294 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (unsigned int number = 0; number < eigthPoints; number++) { |
295 | // Load the ar + ai, br + bi ... as ar,ai,br,bi ... | ||
296 | 32766 | num01 = _mm256_load_ps((float*)a); | |
297 | 32766 | denum01 = _mm256_load_ps((float*)b); | |
298 | |||
299 | 32766 | num01 = _mm256_complexconjugatemul_ps(num01, denum01); | |
300 | 32766 | a += 4; | |
301 | 32766 | b += 4; | |
302 | |||
303 | 32766 | num23 = _mm256_load_ps((float*)a); | |
304 | 32766 | denum23 = _mm256_load_ps((float*)b); | |
305 | 32766 | num23 = _mm256_complexconjugatemul_ps(num23, denum23); | |
306 | 32766 | a += 4; | |
307 | 32766 | b += 4; | |
308 | |||
309 | 65532 | complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01), | |
310 | _mm256_mul_ps(denum23, denum23)); | ||
311 | |||
312 | 32766 | denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50); | |
313 | 32766 | denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa); | |
314 | |||
315 | 32766 | result0 = _mm256_div_ps(num01, denum01); | |
316 | 32766 | result1 = _mm256_div_ps(num23, denum23); | |
317 | |||
318 | _mm256_store_ps((float*)c, result0); | ||
319 | 32766 | c += 4; | |
320 | _mm256_store_ps((float*)c, result1); | ||
321 | 32766 | c += 4; | |
322 | } | ||
323 | |||
324 | 2 | volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8); | |
325 | 2 | } | |
326 | #endif /* LV_HAVE_AVX */ | ||
327 | |||
328 | |||
329 | #ifdef LV_HAVE_NEON | ||
330 | #include <arm_neon.h> | ||
331 | |||
332 | static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, | ||
333 | const lv_32fc_t* aVector, | ||
334 | const lv_32fc_t* bVector, | ||
335 | unsigned int num_points) | ||
336 | { | ||
337 | lv_32fc_t* cPtr = cVector; | ||
338 | const lv_32fc_t* aPtr = aVector; | ||
339 | const lv_32fc_t* bPtr = bVector; | ||
340 | |||
341 | float32x4x2_t aVal, bVal, cVal; | ||
342 | float32x4_t bAbs, bAbsInv; | ||
343 | |||
344 | const unsigned int quarterPoints = num_points / 4; | ||
345 | unsigned int number = 0; | ||
346 | for (; number < quarterPoints; number++) { | ||
347 | aVal = vld2q_f32((const float*)(aPtr)); | ||
348 | bVal = vld2q_f32((const float*)(bPtr)); | ||
349 | aPtr += 4; | ||
350 | bPtr += 4; | ||
351 | __VOLK_PREFETCH(aPtr + 4); | ||
352 | __VOLK_PREFETCH(bPtr + 4); | ||
353 | |||
354 | bAbs = vmulq_f32(bVal.val[0], bVal.val[0]); | ||
355 | bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); | ||
356 | |||
357 | bAbsInv = vrecpeq_f32(bAbs); | ||
358 | bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); | ||
359 | bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); | ||
360 | |||
361 | cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]); | ||
362 | cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); | ||
363 | cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); | ||
364 | |||
365 | cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]); | ||
366 | cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); | ||
367 | cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); | ||
368 | |||
369 | vst2q_f32((float*)(cPtr), cVal); | ||
370 | cPtr += 4; | ||
371 | } | ||
372 | |||
373 | for (number = quarterPoints * 4; number < num_points; number++) { | ||
374 | *cPtr++ = (*aPtr++) / (*bPtr++); | ||
375 | } | ||
376 | } | ||
377 | #endif /* LV_HAVE_NEON */ | ||
378 | |||
379 | |||
380 | #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */ | ||
381 |