Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_multiply_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Multiplies two complex vectors element-wise and stores the complex products in the output vector. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const | ||
20 | * lv_32fc_t* bVector, unsigned int num_points); \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li aVector: The first input vector of complex floats. | ||
24 | * \li bVector: The second input vector of complex floats. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li cVector: The output vector of complex floats. | ||
29 | * | ||
30 | * \b Example | ||
31 | * Mix two signals at f=0.3 and 0.1. | ||
32 | * \code | ||
33 | * unsigned int N = 10; | ||
34 | * unsigned int alignment = volk_get_alignment(); | ||
35 | * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
36 | * lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
37 | * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
38 | * | ||
39 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
40 | * // Generate two tones | ||
41 | * float real_1 = std::cos(0.3f * (float)ii); | ||
42 | * float imag_1 = std::sin(0.3f * (float)ii); | ||
43 | * sig_1[ii] = lv_cmake(real_1, imag_1); | ||
44 | * float real_2 = std::cos(0.1f * (float)ii); | ||
45 | * float imag_2 = std::sin(0.1f * (float)ii); | ||
46 | * sig_2[ii] = lv_cmake(real_2, imag_2); | ||
47 | * } | ||
48 | * | ||
49 | * volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N); | ||
50 | * | ||
51 | * volk_free(sig_1); | ||
52 | * volk_free(sig_2); | ||
53 | * volk_free(out); | ||
54 | * \endcode | ||
55 | */ | ||
56 | |||
57 | #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H | ||
58 | #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H | ||
59 | |||
60 | #include <float.h> | ||
61 | #include <inttypes.h> | ||
62 | #include <stdio.h> | ||
63 | #include <volk/volk_complex.h> | ||
64 | |||
65 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
66 | #include <immintrin.h> | ||
67 | /*! | ||
68 | \brief Multiplies the two input complex vectors and stores their results in the third vector | ||
69 | \param cVector The vector where the results will be stored | ||
70 | \param aVector One of the vectors to be multiplied | ||
71 | \param bVector One of the vectors to be multiplied | ||
72 | \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector | ||
73 | */ | ||
74 | 2 | static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, | |
75 | const lv_32fc_t* aVector, | ||
76 | const lv_32fc_t* bVector, | ||
77 | unsigned int num_points) | ||
78 | { | ||
79 | 2 | unsigned int number = 0; | |
80 | 2 | const unsigned int quarterPoints = num_points / 4; | |
81 | |||
82 | 2 | lv_32fc_t* c = cVector; | |
83 | 2 | const lv_32fc_t* a = aVector; | |
84 | 2 | const lv_32fc_t* b = bVector; | |
85 | |||
86 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
87 | |||
88 | const __m256 x = | ||
89 | 65534 | _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
90 | const __m256 y = | ||
91 | 65534 | _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
92 | |||
93 | 65534 | const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
94 | 65534 | const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di | |
95 | |||
96 | 65534 | const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
97 | |||
98 | 65534 | const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
99 | |||
100 | 65534 | const __m256 z = _mm256_fmaddsub_ps( | |
101 | x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
102 | |||
103 | _mm256_storeu_ps((float*)c, z); // Store the results back into the C container | ||
104 | |||
105 | 65534 | a += 4; | |
106 | 65534 | b += 4; | |
107 | 65534 | c += 4; | |
108 | } | ||
109 | |||
110 | 2 | number = quarterPoints * 4; | |
111 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
112 | 6 | *c++ = (*a++) * (*b++); | |
113 | } | ||
114 | 2 | } | |
115 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ | ||
116 | |||
117 | |||
118 | #ifdef LV_HAVE_AVX | ||
119 | #include <immintrin.h> | ||
120 | #include <volk/volk_avx_intrinsics.h> | ||
121 | |||
122 | 2 | static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, | |
123 | const lv_32fc_t* aVector, | ||
124 | const lv_32fc_t* bVector, | ||
125 | unsigned int num_points) | ||
126 | { | ||
127 | 2 | unsigned int number = 0; | |
128 | 2 | const unsigned int quarterPoints = num_points / 4; | |
129 | |||
130 | __m256 x, y, z; | ||
131 | 2 | lv_32fc_t* c = cVector; | |
132 | 2 | const lv_32fc_t* a = aVector; | |
133 | 2 | const lv_32fc_t* b = bVector; | |
134 | |||
135 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
136 | 65534 | x = _mm256_loadu_ps( | |
137 | (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... | ||
138 | 65534 | y = _mm256_loadu_ps( | |
139 | (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... | ||
140 | 65534 | z = _mm256_complexmul_ps(x, y); | |
141 | _mm256_storeu_ps((float*)c, z); // Store the results back into the C container | ||
142 | |||
143 | 65534 | a += 4; | |
144 | 65534 | b += 4; | |
145 | 65534 | c += 4; | |
146 | } | ||
147 | |||
148 | 2 | number = quarterPoints * 4; | |
149 | |||
150 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
151 | 6 | *c++ = (*a++) * (*b++); | |
152 | } | ||
153 | 2 | } | |
154 | #endif /* LV_HAVE_AVX */ | ||
155 | |||
156 | |||
157 | #ifdef LV_HAVE_SSE3 | ||
158 | #include <pmmintrin.h> | ||
159 | #include <volk/volk_sse3_intrinsics.h> | ||
160 | |||
161 | 2 | static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, | |
162 | const lv_32fc_t* aVector, | ||
163 | const lv_32fc_t* bVector, | ||
164 | unsigned int num_points) | ||
165 | { | ||
166 | 2 | unsigned int number = 0; | |
167 | 2 | const unsigned int halfPoints = num_points / 2; | |
168 | |||
169 | __m128 x, y, z; | ||
170 | 2 | lv_32fc_t* c = cVector; | |
171 | 2 | const lv_32fc_t* a = aVector; | |
172 | 2 | const lv_32fc_t* b = bVector; | |
173 | |||
174 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { |
175 | 131070 | x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
176 | 131070 | y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
177 | 131070 | z = _mm_complexmul_ps(x, y); | |
178 | _mm_storeu_ps((float*)c, z); // Store the results back into the C container | ||
179 | |||
180 | 131070 | a += 2; | |
181 | 131070 | b += 2; | |
182 | 131070 | c += 2; | |
183 | } | ||
184 | |||
185 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if ((num_points % 2) != 0) { |
186 | 2 | *c = (*a) * (*b); | |
187 | } | ||
188 | 2 | } | |
189 | #endif /* LV_HAVE_SSE3 */ | ||
190 | |||
191 | |||
192 | #ifdef LV_HAVE_GENERIC | ||
193 | |||
194 | 2 | static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, | |
195 | const lv_32fc_t* aVector, | ||
196 | const lv_32fc_t* bVector, | ||
197 | unsigned int num_points) | ||
198 | { | ||
199 | 2 | lv_32fc_t* cPtr = cVector; | |
200 | 2 | const lv_32fc_t* aPtr = aVector; | |
201 | 2 | const lv_32fc_t* bPtr = bVector; | |
202 | 2 | unsigned int number = 0; | |
203 | |||
204 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
205 | 262142 | *cPtr++ = (*aPtr++) * (*bPtr++); | |
206 | } | ||
207 | 2 | } | |
208 | #endif /* LV_HAVE_GENERIC */ | ||
209 | |||
210 | |||
211 | #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ | ||
212 | #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H | ||
213 | #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H | ||
214 | |||
215 | #include <float.h> | ||
216 | #include <inttypes.h> | ||
217 | #include <stdio.h> | ||
218 | #include <volk/volk_complex.h> | ||
219 | |||
220 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
221 | #include <immintrin.h> | ||
222 | /*! | ||
223 | \brief Multiplies the two input complex vectors and stores their results in the third vector | ||
224 | \param cVector The vector where the results will be stored | ||
225 | \param aVector One of the vectors to be multiplied | ||
226 | \param bVector One of the vectors to be multiplied | ||
227 | \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector | ||
228 | */ | ||
229 | 2 | static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, | |
230 | const lv_32fc_t* aVector, | ||
231 | const lv_32fc_t* bVector, | ||
232 | unsigned int num_points) | ||
233 | { | ||
234 | 2 | unsigned int number = 0; | |
235 | 2 | const unsigned int quarterPoints = num_points / 4; | |
236 | |||
237 | 2 | lv_32fc_t* c = cVector; | |
238 | 2 | const lv_32fc_t* a = aVector; | |
239 | 2 | const lv_32fc_t* b = bVector; | |
240 | |||
241 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
242 | |||
243 | const __m256 x = | ||
244 | 65534 | _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
245 | const __m256 y = | ||
246 | 65534 | _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
247 | |||
248 | 65534 | const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr | |
249 | 65534 | const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di | |
250 | |||
251 | 65534 | const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br | |
252 | |||
253 | 65534 | const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | |
254 | |||
255 | 65534 | const __m256 z = _mm256_fmaddsub_ps( | |
256 | x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||
257 | |||
258 | _mm256_store_ps((float*)c, z); // Store the results back into the C container | ||
259 | |||
260 | 65534 | a += 4; | |
261 | 65534 | b += 4; | |
262 | 65534 | c += 4; | |
263 | } | ||
264 | |||
265 | 2 | number = quarterPoints * 4; | |
266 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
267 | 6 | *c++ = (*a++) * (*b++); | |
268 | } | ||
269 | 2 | } | |
270 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ | ||
271 | |||
272 | |||
273 | #ifdef LV_HAVE_AVX | ||
274 | #include <immintrin.h> | ||
275 | #include <volk/volk_avx_intrinsics.h> | ||
276 | |||
277 | 2 | static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, | |
278 | const lv_32fc_t* aVector, | ||
279 | const lv_32fc_t* bVector, | ||
280 | unsigned int num_points) | ||
281 | { | ||
282 | 2 | unsigned int number = 0; | |
283 | 2 | const unsigned int quarterPoints = num_points / 4; | |
284 | |||
285 | __m256 x, y, z; | ||
286 | 2 | lv_32fc_t* c = cVector; | |
287 | 2 | const lv_32fc_t* a = aVector; | |
288 | 2 | const lv_32fc_t* b = bVector; | |
289 | |||
290 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
291 | 65534 | x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... | |
292 | 65534 | y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... | |
293 | 65534 | z = _mm256_complexmul_ps(x, y); | |
294 | _mm256_store_ps((float*)c, z); // Store the results back into the C container | ||
295 | |||
296 | 65534 | a += 4; | |
297 | 65534 | b += 4; | |
298 | 65534 | c += 4; | |
299 | } | ||
300 | |||
301 | 2 | number = quarterPoints * 4; | |
302 | |||
303 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
304 | 6 | *c++ = (*a++) * (*b++); | |
305 | } | ||
306 | 2 | } | |
307 | #endif /* LV_HAVE_AVX */ | ||
308 | |||
309 | #ifdef LV_HAVE_SSE3 | ||
310 | #include <pmmintrin.h> | ||
311 | #include <volk/volk_sse3_intrinsics.h> | ||
312 | |||
313 | 2 | static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, | |
314 | const lv_32fc_t* aVector, | ||
315 | const lv_32fc_t* bVector, | ||
316 | unsigned int num_points) | ||
317 | { | ||
318 | 2 | unsigned int number = 0; | |
319 | 2 | const unsigned int halfPoints = num_points / 2; | |
320 | |||
321 | __m128 x, y, z; | ||
322 | 2 | lv_32fc_t* c = cVector; | |
323 | 2 | const lv_32fc_t* a = aVector; | |
324 | 2 | const lv_32fc_t* b = bVector; | |
325 | |||
326 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (; number < halfPoints; number++) { |
327 | 131070 | x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi | |
328 | 131070 | y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di | |
329 | 131070 | z = _mm_complexmul_ps(x, y); | |
330 | _mm_store_ps((float*)c, z); // Store the results back into the C container | ||
331 | |||
332 | 131070 | a += 2; | |
333 | 131070 | b += 2; | |
334 | 131070 | c += 2; | |
335 | } | ||
336 | |||
337 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if ((num_points % 2) != 0) { |
338 | 2 | *c = (*a) * (*b); | |
339 | } | ||
340 | 2 | } | |
341 | #endif /* LV_HAVE_SSE3 */ | ||
342 | |||
343 | |||
344 | #ifdef LV_HAVE_GENERIC | ||
345 | |||
346 | 2 | static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, | |
347 | const lv_32fc_t* aVector, | ||
348 | const lv_32fc_t* bVector, | ||
349 | unsigned int num_points) | ||
350 | { | ||
351 | 2 | lv_32fc_t* cPtr = cVector; | |
352 | 2 | const lv_32fc_t* aPtr = aVector; | |
353 | 2 | const lv_32fc_t* bPtr = bVector; | |
354 | 2 | unsigned int number = 0; | |
355 | |||
356 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
357 | 262142 | *cPtr++ = (*aPtr++) * (*bPtr++); | |
358 | } | ||
359 | 2 | } | |
360 | #endif /* LV_HAVE_GENERIC */ | ||
361 | |||
362 | |||
363 | #ifdef LV_HAVE_NEON | ||
364 | #include <arm_neon.h> | ||
365 | |||
366 | static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, | ||
367 | const lv_32fc_t* aVector, | ||
368 | const lv_32fc_t* bVector, | ||
369 | unsigned int num_points) | ||
370 | { | ||
371 | lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; | ||
372 | lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; | ||
373 | unsigned int quarter_points = num_points / 4; | ||
374 | float32x4x2_t a_val, b_val, c_val; | ||
375 | float32x4x2_t tmp_real, tmp_imag; | ||
376 | unsigned int number = 0; | ||
377 | |||
378 | for (number = 0; number < quarter_points; ++number) { | ||
379 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
380 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
381 | __VOLK_PREFETCH(a_ptr + 4); | ||
382 | __VOLK_PREFETCH(b_ptr + 4); | ||
383 | |||
384 | // multiply the real*real and imag*imag to get real result | ||
385 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
386 | tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
387 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
388 | tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); | ||
389 | |||
390 | // Multiply cross terms to get the imaginary result | ||
391 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
392 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); | ||
393 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
394 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
395 | |||
396 | // store the results | ||
397 | c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); | ||
398 | c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); | ||
399 | vst2q_f32((float*)cVector, c_val); | ||
400 | |||
401 | a_ptr += 4; | ||
402 | b_ptr += 4; | ||
403 | cVector += 4; | ||
404 | } | ||
405 | |||
406 | for (number = quarter_points * 4; number < num_points; number++) { | ||
407 | *cVector++ = (*a_ptr++) * (*b_ptr++); | ||
408 | } | ||
409 | } | ||
410 | #endif /* LV_HAVE_NEON */ | ||
411 | |||
412 | |||
413 | #ifdef LV_HAVE_NEON | ||
414 | |||
415 | static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, | ||
416 | const lv_32fc_t* aVector, | ||
417 | const lv_32fc_t* bVector, | ||
418 | unsigned int num_points) | ||
419 | { | ||
420 | lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; | ||
421 | lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; | ||
422 | unsigned int quarter_points = num_points / 4; | ||
423 | float32x4x2_t a_val, b_val; | ||
424 | float32x4x2_t tmp_imag; | ||
425 | unsigned int number = 0; | ||
426 | |||
427 | for (number = 0; number < quarter_points; ++number) { | ||
428 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
429 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
430 | __VOLK_PREFETCH(a_ptr + 4); | ||
431 | __VOLK_PREFETCH(b_ptr + 4); | ||
432 | |||
433 | // do the first multiply | ||
434 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
435 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
436 | |||
437 | // use multiply accumulate/subtract to get result | ||
438 | tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); | ||
439 | tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); | ||
440 | |||
441 | // store | ||
442 | vst2q_f32((float*)cVector, tmp_imag); | ||
443 | // increment pointers | ||
444 | a_ptr += 4; | ||
445 | b_ptr += 4; | ||
446 | cVector += 4; | ||
447 | } | ||
448 | |||
449 | for (number = quarter_points * 4; number < num_points; number++) { | ||
450 | *cVector++ = (*a_ptr++) * (*b_ptr++); | ||
451 | } | ||
452 | } | ||
453 | #endif /* LV_HAVE_NEON */ | ||
454 | |||
455 | |||
456 | #ifdef LV_HAVE_NEONV7 | ||
457 | |||
458 | extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, | ||
459 | const lv_32fc_t* aVector, | ||
460 | const lv_32fc_t* bVector, | ||
461 | unsigned int num_points); | ||
462 | #endif /* LV_HAVE_NEONV7 */ | ||
463 | |||
464 | |||
465 | #ifdef LV_HAVE_ORC | ||
466 | |||
467 | extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, | ||
468 | const lv_32fc_t* aVector, | ||
469 | const lv_32fc_t* bVector, | ||
470 | unsigned int num_points); | ||
471 | |||
472 | 2 | static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, | |
473 | const lv_32fc_t* aVector, | ||
474 | const lv_32fc_t* bVector, | ||
475 | unsigned int num_points) | ||
476 | { | ||
477 | 2 | volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); | |
478 | 2 | } | |
479 | |||
480 | #endif /* LV_HAVE_ORC */ | ||
481 | |||
482 | #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ | ||
483 |