Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8ic_x2_s32f_multiply_conjugate_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Multiplies one complex vector by the complex conjugate of the | ||
16 | * second complex vector and stores the results in the third vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* | ||
21 | * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: One of the complex vectors to be multiplied. | ||
25 | * \li bVector: The complex vector which will be conjugated and then multiplied. | ||
26 | * \li scalar: Each output value is scaled by 1/scalar. | ||
27 | * \li num_points: The number of complex values in aVector and bVector to be | ||
28 | * multiplied together and stored into cVector. | ||
29 | * | ||
30 | * \b Outputs | ||
31 | * \li cVector: The complex vector where the results will be stored. | ||
32 | * | ||
33 | * \b Example | ||
34 | * \code | ||
35 | * int N = 10000; | ||
36 | * | ||
37 | * <FIXME> | ||
38 | * | ||
39 | * volk_8ic_x2_s32f_multiply_conjugate_32fc(); | ||
40 | * | ||
41 | * \endcode | ||
42 | */ | ||
43 | |||
44 | #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H | ||
45 | #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H | ||
46 | |||
47 | #include <inttypes.h> | ||
48 | #include <stdio.h> | ||
49 | #include <volk/volk_complex.h> | ||
50 | |||
51 | #ifdef LV_HAVE_AVX2 | ||
52 | #include <immintrin.h> | ||
53 | |||
54 | /* Aligned AVX2 kernel: cVector[i] = aVector[i] * conj(bVector[i]) scaled by 1/scalar, for i in [0, num_points). The vector loop uses the aligned load and store intrinsics, so the three buffers are expected to be suitably aligned. */ static inline void | ||
55 | 2 | volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, /* output, written as interleaved float real/imag pairs */ | |
56 | const lv_8sc_t* aVector, /* first input, read as interleaved int8 real/imag pairs */ | ||
57 | const lv_8sc_t* bVector, /* second input, conjugated before the multiply */ | ||
58 | const float scalar, /* every output value is divided by this */ | ||
59 | unsigned int num_points) /* number of complex points to process */ | ||
60 | { | ||
61 | 2 | unsigned int number = 0; | |
62 | 2 | const unsigned int oneEigthPoints = num_points / 8; /* number of full 8-point groups for the vector loop */ | |
63 | |||
64 | __m256i x, y, realz, imagz; | ||
65 | __m256 ret, retlo, rethi; | ||
66 | 2 | lv_32fc_t* c = cVector; | |
67 | 2 | const lv_8sc_t* a = aVector; | |
68 | 2 | const lv_8sc_t* b = bVector; | |
69 | __m256i conjugateSign = | ||
70 | 2 | _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); /* arguments run from the highest lane down, so even lanes (real) get +1 and odd lanes (imag) get -1 */ | |
71 | |||
72 | 2 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); /* reciprocal computed once; 1.0 is a double literal, so the division happens in double before narrowing to float */ | |
73 | |||
74 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < oneEigthPoints; number++) { |
75 | // Load 16 bytes from each input (the 8 points consumed per iteration) and sign-extend int8 -> int16 | ||
76 | 65532 | x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); | |
77 | 65532 | y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); | |
78 | |||
79 | // Real part: multiply matching 16-bit lanes and add adjacent pairs, giving ar*br + ai*bi per point | ||
80 | 32766 | realz = _mm256_madd_epi16(x, y); | |
81 | |||
82 | // Conjugate b: negate the imaginary 16-bit lanes, giving (br, -bi) per point | ||
83 | 32766 | y = _mm256_sign_epi16(y, conjugateSign); | |
84 | |||
85 | // Swap the two 16-bit lanes of every point, giving (-bi, br) | ||
86 | 32766 | y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
87 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
88 | |||
89 | // Imaginary part: ar*(-bi) + ai*br per point | ||
90 | 32766 | imagz = _mm256_madd_epi16(x, y); | |
91 | |||
92 | // Interleave real and imaginary sums from the low half of each 128-bit lane and convert to float | ||
93 | 65532 | retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); | |
94 | |||
95 | // Scale by 1/scalar | ||
96 | 32766 | retlo = _mm256_mul_ps(retlo, invScalar); | |
97 | |||
98 | // Same interleave and conversion for the high half of each 128-bit lane | ||
99 | 65532 | rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); | |
100 | |||
101 | // Scale by 1/scalar | ||
102 | 32766 | rethi = _mm256_mul_ps(rethi, invScalar); | |
103 | |||
104 | 32766 | ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); /* low 128-bit lanes of retlo and rethi: points 0-3 back in order */ | |
105 | _mm256_store_ps((float*)c, ret); /* aligned store of 8 floats (4 points) */ | ||
106 | 32766 | c += 4; | |
107 | |||
108 | 32766 | ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); /* high 128-bit lanes of retlo and rethi: points 4-7 */ | |
109 | _mm256_store_ps((float*)c, ret); | ||
110 | 32766 | c += 4; | |
111 | |||
112 | 32766 | a += 8; | |
113 | 32766 | b += 8; | |
114 | } | ||
115 | |||
116 | 2 | number = oneEigthPoints * 8; /* first point not covered by the vector loop */ | |
117 | 2 | float* cFloatPtr = (float*)&cVector[number]; | |
118 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; /* cast drops const; the pointer is only read through */ | |
119 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
120 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
121 | 14 | float aReal = (float)*a8Ptr++; | |
122 | 14 | float aImag = (float)*a8Ptr++; | |
123 | 14 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
124 | 14 | float bReal = (float)*b8Ptr++; | |
125 | 14 | float bImag = (float)*b8Ptr++; | |
126 | 14 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); /* conjugate of b */ | |
127 | 14 | lv_32fc_t temp = aVal * bVal; | |
128 | |||
129 | 14 | *cFloatPtr++ = lv_creal(temp) / scalar; /* NOTE(review): the tail divides by scalar while the vector loop multiplies by 1/scalar; the two may not round identically - confirm this is acceptable */ | |
130 | 14 | *cFloatPtr++ = lv_cimag(temp) / scalar; | |
131 | } | ||
132 | 2 | } | |
133 | #endif /* LV_HAVE_AVX2*/ | ||
134 | |||
135 | |||
136 | #ifdef LV_HAVE_SSE4_1 | ||
137 | #include <smmintrin.h> | ||
138 | |||
139 | /* Aligned SSE4.1 kernel: cVector[i] = aVector[i] * conj(bVector[i]) scaled by 1/scalar, for i in [0, num_points). Four points per vector iteration; the output is written with the aligned store intrinsic. */ static inline void | ||
140 | 2 | volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, /* output, written as interleaved float real/imag pairs */ | |
141 | const lv_8sc_t* aVector, /* first input, read as interleaved int8 real/imag pairs */ | ||
142 | const lv_8sc_t* bVector, /* second input, conjugated before the multiply */ | ||
143 | const float scalar, /* every output value is divided by this */ | ||
144 | unsigned int num_points) /* number of complex points to process */ | ||
145 | { | ||
146 | 2 | unsigned int number = 0; | |
147 | 2 | const unsigned int quarterPoints = num_points / 4; /* number of full 4-point groups for the vector loop */ | |
148 | |||
149 | __m128i x, y, realz, imagz; | ||
150 | __m128 ret; | ||
151 | 2 | lv_32fc_t* c = cVector; | |
152 | 2 | const lv_8sc_t* a = aVector; | |
153 | 2 | const lv_8sc_t* b = bVector; | |
154 | 2 | __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); /* arguments run from the highest lane down, so even lanes (real) get +1 and odd lanes (imag) get -1 */ | |
155 | |||
156 | 2 | __m128 invScalar = _mm_set_ps1(1.0 / scalar); /* reciprocal computed once; 1.0 is a double literal, so the division happens in double before narrowing to float */ | |
157 | |||
158 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
159 | // Load 8 bytes from each input (the 4 points consumed per iteration) and sign-extend int8 -> int16 | ||
160 | 131068 | x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); | |
161 | 131068 | y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); | |
162 | |||
163 | // Real part: multiply matching 16-bit lanes and add adjacent pairs, giving ar*br + ai*bi per point | ||
164 | 65534 | realz = _mm_madd_epi16(x, y); | |
165 | |||
166 | // Conjugate b: negate the imaginary 16-bit lanes, giving (br, -bi) per point | ||
167 | 65534 | y = _mm_sign_epi16(y, conjugateSign); | |
168 | |||
169 | // Swap the two 16-bit lanes of every point, giving (-bi, br) | ||
170 | 65534 | y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
171 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
172 | |||
173 | // Imaginary part: ar*(-bi) + ai*br per point | ||
174 | 65534 | imagz = _mm_madd_epi16(x, y); | |
175 | |||
176 | // Interleave real and imaginary sums of points 0-1 and convert to float | ||
177 | 131068 | ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); | |
178 | |||
179 | // Scale by 1/scalar | ||
180 | 65534 | ret = _mm_mul_ps(ret, invScalar); | |
181 | |||
182 | // Aligned store of 4 floats (2 points) | ||
183 | _mm_store_ps((float*)c, ret); | ||
184 | 65534 | c += 2; | |
185 | |||
186 | // Interleave real and imaginary sums of points 2-3 and convert to float | ||
187 | 131068 | ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); | |
188 | |||
189 | // Scale by 1/scalar | ||
190 | 65534 | ret = _mm_mul_ps(ret, invScalar); | |
191 | |||
192 | // Aligned store of 4 floats (2 points) | ||
193 | _mm_store_ps((float*)c, ret); | ||
194 | 65534 | c += 2; | |
195 | |||
196 | 65534 | a += 4; | |
197 | 65534 | b += 4; | |
198 | } | ||
199 | |||
200 | 2 | number = quarterPoints * 4; /* first point not covered by the vector loop */ | |
201 | 2 | float* cFloatPtr = (float*)&cVector[number]; | |
202 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; /* cast drops const; the pointer is only read through */ | |
203 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
204 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
205 | 6 | float aReal = (float)*a8Ptr++; | |
206 | 6 | float aImag = (float)*a8Ptr++; | |
207 | 6 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
208 | 6 | float bReal = (float)*b8Ptr++; | |
209 | 6 | float bImag = (float)*b8Ptr++; | |
210 | 6 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); /* conjugate of b */ | |
211 | 6 | lv_32fc_t temp = aVal * bVal; | |
212 | |||
213 | 6 | *cFloatPtr++ = lv_creal(temp) / scalar; /* NOTE(review): the tail divides by scalar while the vector loop multiplies by 1/scalar; the two may not round identically - confirm this is acceptable */ | |
214 | 6 | *cFloatPtr++ = lv_cimag(temp) / scalar; | |
215 | } | ||
216 | 2 | } | |
217 | #endif /* LV_HAVE_SSE4_1 */ | ||
218 | |||
219 | |||
220 | #ifdef LV_HAVE_GENERIC | ||
221 | |||
222 | /* Portable scalar kernel: cVector[i] = aVector[i] * conj(bVector[i]) scaled by 1/scalar, one point per iteration. */ static inline void | ||
223 | 2 | volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, /* output, written as interleaved float real/imag pairs */ | |
224 | const lv_8sc_t* aVector, /* first input, read as interleaved int8 real/imag pairs */ | ||
225 | const lv_8sc_t* bVector, /* second input, conjugated before the multiply */ | ||
226 | const float scalar, /* every output value is multiplied by 1/scalar */ | ||
227 | unsigned int num_points) /* number of complex points to process */ | ||
228 | { | ||
229 | 2 | unsigned int number = 0; | |
230 | 2 | float* cPtr = (float*)cVector; | |
231 | 2 | const float invScalar = 1.0 / scalar; /* reciprocal computed once; 1.0 is a double literal, so the division happens in double before narrowing to float */ | |
232 | 2 | int8_t* a8Ptr = (int8_t*)aVector; /* cast drops const; the pointer is only read through */ | |
233 | 2 | int8_t* b8Ptr = (int8_t*)bVector; | |
234 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
235 | 262142 | float aReal = (float)*a8Ptr++; | |
236 | 262142 | float aImag = (float)*a8Ptr++; | |
237 | 262142 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
238 | 262142 | float bReal = (float)*b8Ptr++; | |
239 | 262142 | float bImag = (float)*b8Ptr++; | |
240 | 262142 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); /* conjugate of b */ | |
241 | 262142 | lv_32fc_t temp = aVal * bVal; | |
242 | |||
243 | 262142 | *cPtr++ = (lv_creal(temp) * invScalar); /* real part first, then imaginary part */ | |
244 | 262142 | *cPtr++ = (lv_cimag(temp) * invScalar); | |
245 | } | ||
246 | 2 | } | |
247 | #endif /* LV_HAVE_GENERIC */ | ||
248 | |||
249 | |||
250 | #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */ | ||
251 | |||
252 | #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H | ||
253 | #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H | ||
254 | |||
255 | #include <inttypes.h> | ||
256 | #include <stdio.h> | ||
257 | #include <volk/volk_complex.h> | ||
258 | |||
259 | #ifdef LV_HAVE_AVX2 | ||
260 | #include <immintrin.h> | ||
261 | |||
262 | /* Unaligned AVX2 kernel: cVector[i] = aVector[i] * conj(bVector[i]) scaled by 1/scalar, for i in [0, num_points). The vector loop uses the unaligned load and store intrinsics. */ static inline void | ||
263 | 2 | volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, /* output, written as interleaved float real/imag pairs */ | |
264 | const lv_8sc_t* aVector, /* first input, read as interleaved int8 real/imag pairs */ | ||
265 | const lv_8sc_t* bVector, /* second input, conjugated before the multiply */ | ||
266 | const float scalar, /* every output value is divided by this */ | ||
267 | unsigned int num_points) /* number of complex points to process */ | ||
268 | { | ||
269 | 2 | unsigned int number = 0; | |
270 | 2 | const unsigned int oneEigthPoints = num_points / 8; /* number of full 8-point groups for the vector loop */ | |
271 | |||
272 | __m256i x, y, realz, imagz; | ||
273 | __m256 ret, retlo, rethi; | ||
274 | 2 | lv_32fc_t* c = cVector; | |
275 | 2 | const lv_8sc_t* a = aVector; | |
276 | 2 | const lv_8sc_t* b = bVector; | |
277 | __m256i conjugateSign = | ||
278 | 2 | _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); /* arguments run from the highest lane down, so even lanes (real) get +1 and odd lanes (imag) get -1 */ | |
279 | |||
280 | 2 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); /* reciprocal computed once; 1.0 is a double literal, so the division happens in double before narrowing to float */ | |
281 | |||
282 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < oneEigthPoints; number++) { |
283 | // Load 16 bytes from each input (the 8 points consumed per iteration) and sign-extend int8 -> int16 | ||
284 | 65532 | x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); | |
285 | 65532 | y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); | |
286 | |||
287 | // Real part: multiply matching 16-bit lanes and add adjacent pairs, giving ar*br + ai*bi per point | ||
288 | 32766 | realz = _mm256_madd_epi16(x, y); | |
289 | |||
290 | // Conjugate b: negate the imaginary 16-bit lanes, giving (br, -bi) per point | ||
291 | 32766 | y = _mm256_sign_epi16(y, conjugateSign); | |
292 | |||
293 | // Swap the two 16-bit lanes of every point, giving (-bi, br) | ||
294 | 32766 | y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
295 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
296 | |||
297 | // Imaginary part: ar*(-bi) + ai*br per point | ||
298 | 32766 | imagz = _mm256_madd_epi16(x, y); | |
299 | |||
300 | // Interleave real and imaginary sums from the low half of each 128-bit lane and convert to float | ||
301 | 65532 | retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); | |
302 | |||
303 | // Scale by 1/scalar | ||
304 | 32766 | retlo = _mm256_mul_ps(retlo, invScalar); | |
305 | |||
306 | // Same interleave and conversion for the high half of each 128-bit lane | ||
307 | 65532 | rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); | |
308 | |||
309 | // Scale by 1/scalar | ||
310 | 32766 | rethi = _mm256_mul_ps(rethi, invScalar); | |
311 | |||
312 | 32766 | ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); /* low 128-bit lanes of retlo and rethi: points 0-3 back in order */ | |
313 | _mm256_storeu_ps((float*)c, ret); /* unaligned store of 8 floats (4 points) */ | ||
314 | 32766 | c += 4; | |
315 | |||
316 | 32766 | ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); /* high 128-bit lanes of retlo and rethi: points 4-7 */ | |
317 | _mm256_storeu_ps((float*)c, ret); | ||
318 | 32766 | c += 4; | |
319 | |||
320 | 32766 | a += 8; | |
321 | 32766 | b += 8; | |
322 | } | ||
323 | |||
324 | 2 | number = oneEigthPoints * 8; /* first point not covered by the vector loop */ | |
325 | 2 | float* cFloatPtr = (float*)&cVector[number]; | |
326 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; /* cast drops const; the pointer is only read through */ | |
327 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
328 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
329 | 14 | float aReal = (float)*a8Ptr++; | |
330 | 14 | float aImag = (float)*a8Ptr++; | |
331 | 14 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
332 | 14 | float bReal = (float)*b8Ptr++; | |
333 | 14 | float bImag = (float)*b8Ptr++; | |
334 | 14 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); /* conjugate of b */ | |
335 | 14 | lv_32fc_t temp = aVal * bVal; | |
336 | |||
337 | 14 | *cFloatPtr++ = lv_creal(temp) / scalar; /* NOTE(review): the tail divides by scalar while the vector loop multiplies by 1/scalar; the two may not round identically - confirm this is acceptable */ | |
338 | 14 | *cFloatPtr++ = lv_cimag(temp) / scalar; | |
339 | } | ||
340 | 2 | } | |
341 | #endif /* LV_HAVE_AVX2*/ | ||
342 | |||
343 | |||
344 | #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ | ||
345 |