GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 135 135 100.0%
Functions: 4 4 100.0%
Branches: 14 14 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_8ic_x2_s32f_multiply_conjugate_32fc
12 *
13 * \b Overview
14 *
15 * Multiplies one complex vector by the complex conjugate of a second
16 * complex vector, scales each product by 1/scalar, and stores the results in a third vector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t*
21 * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li aVector: One of the complex vectors to be multiplied.
25 * \li bVector: The complex vector that is conjugated and then multiplied.
26 * \li scalar: Each output value is scaled by 1/scalar.
27 * \li num_points: The number of complex values in aVector and bVector to be
28 * multiplied together and stored into cVector.
29 *
30 * \b Outputs
31 * \li cVector: The complex vector where the results will be stored.
32 *
33 * \b Example
34 * \code
35 * int N = 10000;
36 *
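37 * // FIXME: allocate and fill aVector and bVector, and allocate cVector, with N points each
38 *
39 * volk_8ic_x2_s32f_multiply_conjugate_32fc(cVector, aVector, bVector, scalar, N);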
40 *
41 * \endcode
42 */
43
44 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46
47 #include <inttypes.h>
48 #include <stdio.h>
49 #include <volk/volk_complex.h>
50
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53
54 static inline void
55 2 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
56 const lv_8sc_t* aVector,
57 const lv_8sc_t* bVector,
58 const float scalar,
59 unsigned int num_points)
60 {
61 2 unsigned int number = 0;
62 2 const unsigned int oneEigthPoints = num_points / 8;
63
64 __m256i x, y, realz, imagz;
65 __m256 ret, retlo, rethi;
66 2 lv_32fc_t* c = cVector;
67 2 const lv_8sc_t* a = aVector;
68 2 const lv_8sc_t* b = bVector;
69 __m256i conjugateSign =
70 2 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
71
72 2 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73
74
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < oneEigthPoints; number++) {
75 // Convert 8 bit values into 16 bit values
76 65532 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77 65532 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
78
79 // Calculate the ar*cr - ai*(-ci) portions
80 32766 realz = _mm256_madd_epi16(x, y);
81
82 // Calculate the complex conjugate of the cr + ci j values
83 32766 y = _mm256_sign_epi16(y, conjugateSign);
84
85 // Shift the order of the cr and ci values
86 32766 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87 _MM_SHUFFLE(2, 3, 0, 1));
88
89 // Calculate the ar*(-ci) + cr*(ai)
90 32766 imagz = _mm256_madd_epi16(x, y);
91
92 // Interleave real and imaginary and then convert to float values
93 65532 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
94
95 // Normalize the floating point values
96 32766 retlo = _mm256_mul_ps(retlo, invScalar);
97
98 // Interleave real and imaginary and then convert to float values
99 65532 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
100
101 // Normalize the floating point values
102 32766 rethi = _mm256_mul_ps(rethi, invScalar);
103
104 32766 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105 _mm256_store_ps((float*)c, ret);
106 32766 c += 4;
107
108 32766 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109 _mm256_store_ps((float*)c, ret);
110 32766 c += 4;
111
112 32766 a += 8;
113 32766 b += 8;
114 }
115
116 2 number = oneEigthPoints * 8;
117 2 float* cFloatPtr = (float*)&cVector[number];
118 2 int8_t* a8Ptr = (int8_t*)&aVector[number];
119 2 int8_t* b8Ptr = (int8_t*)&bVector[number];
120
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
121 14 float aReal = (float)*a8Ptr++;
122 14 float aImag = (float)*a8Ptr++;
123 14 lv_32fc_t aVal = lv_cmake(aReal, aImag);
124 14 float bReal = (float)*b8Ptr++;
125 14 float bImag = (float)*b8Ptr++;
126 14 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
127 14 lv_32fc_t temp = aVal * bVal;
128
129 14 *cFloatPtr++ = lv_creal(temp) / scalar;
130 14 *cFloatPtr++ = lv_cimag(temp) / scalar;
131 }
132 2 }
133 #endif /* LV_HAVE_AVX2*/
134
135
136 #ifdef LV_HAVE_SSE4_1
137 #include <smmintrin.h>
138
139 static inline void
140 2 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
141 const lv_8sc_t* aVector,
142 const lv_8sc_t* bVector,
143 const float scalar,
144 unsigned int num_points)
145 {
146 2 unsigned int number = 0;
147 2 const unsigned int quarterPoints = num_points / 4;
148
149 __m128i x, y, realz, imagz;
150 __m128 ret;
151 2 lv_32fc_t* c = cVector;
152 2 const lv_8sc_t* a = aVector;
153 2 const lv_8sc_t* b = bVector;
154 2 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
155
156 2 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157
158
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
159 // Convert 8 bit values into 16 bit values
160 131068 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161 131068 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
162
163 // Calculate the ar*cr - ai*(-ci) portions
164 65534 realz = _mm_madd_epi16(x, y);
165
166 // Calculate the complex conjugate of the cr + ci j values
167 65534 y = _mm_sign_epi16(y, conjugateSign);
168
169 // Shift the order of the cr and ci values
170 65534 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171 _MM_SHUFFLE(2, 3, 0, 1));
172
173 // Calculate the ar*(-ci) + cr*(ai)
174 65534 imagz = _mm_madd_epi16(x, y);
175
176 // Interleave real and imaginary and then convert to float values
177 131068 ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
178
179 // Normalize the floating point values
180 65534 ret = _mm_mul_ps(ret, invScalar);
181
182 // Store the floating point values
183 _mm_store_ps((float*)c, ret);
184 65534 c += 2;
185
186 // Interleave real and imaginary and then convert to float values
187 131068 ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
188
189 // Normalize the floating point values
190 65534 ret = _mm_mul_ps(ret, invScalar);
191
192 // Store the floating point values
193 _mm_store_ps((float*)c, ret);
194 65534 c += 2;
195
196 65534 a += 4;
197 65534 b += 4;
198 }
199
200 2 number = quarterPoints * 4;
201 2 float* cFloatPtr = (float*)&cVector[number];
202 2 int8_t* a8Ptr = (int8_t*)&aVector[number];
203 2 int8_t* b8Ptr = (int8_t*)&bVector[number];
204
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
205 6 float aReal = (float)*a8Ptr++;
206 6 float aImag = (float)*a8Ptr++;
207 6 lv_32fc_t aVal = lv_cmake(aReal, aImag);
208 6 float bReal = (float)*b8Ptr++;
209 6 float bImag = (float)*b8Ptr++;
210 6 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
211 6 lv_32fc_t temp = aVal * bVal;
212
213 6 *cFloatPtr++ = lv_creal(temp) / scalar;
214 6 *cFloatPtr++ = lv_cimag(temp) / scalar;
215 }
216 2 }
217 #endif /* LV_HAVE_SSE4_1 */
218
219
220 #ifdef LV_HAVE_GENERIC
221
222 static inline void
223 2 volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
224 const lv_8sc_t* aVector,
225 const lv_8sc_t* bVector,
226 const float scalar,
227 unsigned int num_points)
228 {
229 2 unsigned int number = 0;
230 2 float* cPtr = (float*)cVector;
231 2 const float invScalar = 1.0 / scalar;
232 2 int8_t* a8Ptr = (int8_t*)aVector;
233 2 int8_t* b8Ptr = (int8_t*)bVector;
234
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
235 262142 float aReal = (float)*a8Ptr++;
236 262142 float aImag = (float)*a8Ptr++;
237 262142 lv_32fc_t aVal = lv_cmake(aReal, aImag);
238 262142 float bReal = (float)*b8Ptr++;
239 262142 float bImag = (float)*b8Ptr++;
240 262142 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241 262142 lv_32fc_t temp = aVal * bVal;
242
243 262142 *cPtr++ = (lv_creal(temp) * invScalar);
244 262142 *cPtr++ = (lv_cimag(temp) * invScalar);
245 }
246 2 }
247 #endif /* LV_HAVE_GENERIC */
248
249
250 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251
252 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254
255 #include <inttypes.h>
256 #include <stdio.h>
257 #include <volk/volk_complex.h>
258
259 #ifdef LV_HAVE_AVX2
260 #include <immintrin.h>
261
262 static inline void
263 2 volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
264 const lv_8sc_t* aVector,
265 const lv_8sc_t* bVector,
266 const float scalar,
267 unsigned int num_points)
268 {
269 2 unsigned int number = 0;
270 2 const unsigned int oneEigthPoints = num_points / 8;
271
272 __m256i x, y, realz, imagz;
273 __m256 ret, retlo, rethi;
274 2 lv_32fc_t* c = cVector;
275 2 const lv_8sc_t* a = aVector;
276 2 const lv_8sc_t* b = bVector;
277 __m256i conjugateSign =
278 2 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
279
280 2 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
281
282
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < oneEigthPoints; number++) {
283 // Convert 8 bit values into 16 bit values
284 65532 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
285 65532 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
286
287 // Calculate the ar*cr - ai*(-ci) portions
288 32766 realz = _mm256_madd_epi16(x, y);
289
290 // Calculate the complex conjugate of the cr + ci j values
291 32766 y = _mm256_sign_epi16(y, conjugateSign);
292
293 // Shift the order of the cr and ci values
294 32766 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
295 _MM_SHUFFLE(2, 3, 0, 1));
296
297 // Calculate the ar*(-ci) + cr*(ai)
298 32766 imagz = _mm256_madd_epi16(x, y);
299
300 // Interleave real and imaginary and then convert to float values
301 65532 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
302
303 // Normalize the floating point values
304 32766 retlo = _mm256_mul_ps(retlo, invScalar);
305
306 // Interleave real and imaginary and then convert to float values
307 65532 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
308
309 // Normalize the floating point values
310 32766 rethi = _mm256_mul_ps(rethi, invScalar);
311
312 32766 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
313 _mm256_storeu_ps((float*)c, ret);
314 32766 c += 4;
315
316 32766 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
317 _mm256_storeu_ps((float*)c, ret);
318 32766 c += 4;
319
320 32766 a += 8;
321 32766 b += 8;
322 }
323
324 2 number = oneEigthPoints * 8;
325 2 float* cFloatPtr = (float*)&cVector[number];
326 2 int8_t* a8Ptr = (int8_t*)&aVector[number];
327 2 int8_t* b8Ptr = (int8_t*)&bVector[number];
328
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
329 14 float aReal = (float)*a8Ptr++;
330 14 float aImag = (float)*a8Ptr++;
331 14 lv_32fc_t aVal = lv_cmake(aReal, aImag);
332 14 float bReal = (float)*b8Ptr++;
333 14 float bImag = (float)*b8Ptr++;
334 14 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
335 14 lv_32fc_t temp = aVal * bVal;
336
337 14 *cFloatPtr++ = lv_creal(temp) / scalar;
338 14 *cFloatPtr++ = lv_cimag(temp) / scalar;
339 }
340 2 }
341 #endif /* LV_HAVE_AVX2*/
342
343
344 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
345