Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_s32f_magnitude_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes the magnitude of the complexVector and stores the results | ||
16 | * in the magnitudeVector as a scaled floating point number. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* | ||
21 | * complexVector, const float scalar, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li complexVector: The complex input vector of complex 16-bit shorts. | ||
25 | * \li scalar: The value that each sample of the input complex vector is divided by before its magnitude is taken. | ||
26 | * \li num_points: The number of samples. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li magnitudeVector: The magnitude of the complex values. | ||
30 | * | ||
31 | * \b Example | ||
32 | * \code | ||
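33 | * int N = 10000; | ||
34 | * lv_16sc_t* x = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, volk_get_alignment()); | ||
35 | * float* t = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment()); | ||
36 | * volk_16ic_s32f_magnitude_32f(t, x, 32768.f, N); // fill x with samples beforehand | ||
37 | * volk_free(x); | ||
38 | * volk_free(t); | ||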
39 | * \endcode | ||
40 | */ | ||
41 | |||
42 | #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H | ||
43 | #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H | ||
44 | |||
45 | #include <inttypes.h> | ||
46 | #include <math.h> | ||
47 | #include <stdio.h> | ||
48 | #include <volk/volk_common.h> | ||
49 | |||
50 | #ifdef LV_HAVE_AVX2 | ||
51 | #include <immintrin.h> | ||
52 | |||
53 | 2 | static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, | |
54 | const lv_16sc_t* complexVector, | ||
55 | const float scalar, | ||
56 | unsigned int num_points) | ||
57 | { | ||
58 | 2 | unsigned int number = 0; | |
59 | 2 | const unsigned int eighthPoints = num_points / 8; | |
60 | |||
61 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
62 | 2 | float* magnitudeVectorPtr = magnitudeVector; | |
63 | |||
64 | 4 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); | |
65 | |||
66 | __m256 cplxValue1, cplxValue2, result; | ||
67 | __m256i int1, int2; | ||
68 | __m128i short1, short2; | ||
69 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
70 | |||
71 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
72 | |||
73 | 32766 | int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
74 | 32766 | complexVectorPtr += 16; | |
75 | 32766 | short1 = _mm256_extracti128_si256(int1, 0); | |
76 | 32766 | short2 = _mm256_extracti128_si256(int1, 1); | |
77 | |||
78 | 32766 | int1 = _mm256_cvtepi16_epi32(short1); | |
79 | 32766 | int2 = _mm256_cvtepi16_epi32(short2); | |
80 | 32766 | cplxValue1 = _mm256_cvtepi32_ps(int1); | |
81 | 32766 | cplxValue2 = _mm256_cvtepi32_ps(int2); | |
82 | |||
83 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); | |
84 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); | |
85 | |||
86 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values | |
87 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values | |
88 | |||
89 | 32766 | result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values | |
90 | 32766 | result = _mm256_permutevar8x32_ps(result, idx); | |
91 | |||
92 | 32766 | result = _mm256_sqrt_ps(result); // Square root the values | |
93 | |||
94 | _mm256_store_ps(magnitudeVectorPtr, result); | ||
95 | |||
96 | 32766 | magnitudeVectorPtr += 8; | |
97 | } | ||
98 | |||
99 | 2 | number = eighthPoints * 8; | |
100 | 2 | magnitudeVectorPtr = &magnitudeVector[number]; | |
101 | 2 | complexVectorPtr = (const int16_t*)&complexVector[number]; | |
102 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
103 | 14 | float val1Real = (float)(*complexVectorPtr++) / scalar; | |
104 | 14 | float val1Imag = (float)(*complexVectorPtr++) / scalar; | |
105 | 14 | *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); | |
106 | } | ||
107 | 2 | } | |
108 | #endif /* LV_HAVE_AVX2 */ | ||
109 | |||
110 | |||
111 | #ifdef LV_HAVE_SSE3 | ||
112 | #include <pmmintrin.h> | ||
113 | |||
114 | 2 | static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, | |
115 | const lv_16sc_t* complexVector, | ||
116 | const float scalar, | ||
117 | unsigned int num_points) | ||
118 | { | ||
119 | 2 | unsigned int number = 0; | |
120 | 2 | const unsigned int quarterPoints = num_points / 4; | |
121 | |||
122 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
123 | 2 | float* magnitudeVectorPtr = magnitudeVector; | |
124 | |||
125 | 2 | __m128 invScalar = _mm_set_ps1(1.0 / scalar); | |
126 | |||
127 | __m128 cplxValue1, cplxValue2, result; | ||
128 | |||
129 | __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; | ||
130 | |||
131 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
132 | |||
133 | 65534 | inputFloatBuffer[0] = (float)(complexVectorPtr[0]); | |
134 | 65534 | inputFloatBuffer[1] = (float)(complexVectorPtr[1]); | |
135 | 65534 | inputFloatBuffer[2] = (float)(complexVectorPtr[2]); | |
136 | 65534 | inputFloatBuffer[3] = (float)(complexVectorPtr[3]); | |
137 | |||
138 | 65534 | inputFloatBuffer[4] = (float)(complexVectorPtr[4]); | |
139 | 65534 | inputFloatBuffer[5] = (float)(complexVectorPtr[5]); | |
140 | 65534 | inputFloatBuffer[6] = (float)(complexVectorPtr[6]); | |
141 | 65534 | inputFloatBuffer[7] = (float)(complexVectorPtr[7]); | |
142 | |||
143 | 65534 | cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); | |
144 | 65534 | cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); | |
145 | |||
146 | 65534 | complexVectorPtr += 8; | |
147 | |||
148 | 65534 | cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); | |
149 | 65534 | cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); | |
150 | |||
151 | 65534 | cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values | |
152 | 65534 | cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values | |
153 | |||
154 | 65534 | result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values | |
155 | |||
156 | 65534 | result = _mm_sqrt_ps(result); // Square root the values | |
157 | |||
158 | _mm_store_ps(magnitudeVectorPtr, result); | ||
159 | |||
160 | 65534 | magnitudeVectorPtr += 4; | |
161 | } | ||
162 | |||
163 | 2 | number = quarterPoints * 4; | |
164 | 2 | magnitudeVectorPtr = &magnitudeVector[number]; | |
165 | 2 | complexVectorPtr = (const int16_t*)&complexVector[number]; | |
166 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
167 | 6 | float val1Real = (float)(*complexVectorPtr++) / scalar; | |
168 | 6 | float val1Imag = (float)(*complexVectorPtr++) / scalar; | |
169 | 6 | *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); | |
170 | } | ||
171 | 2 | } | |
172 | #endif /* LV_HAVE_SSE3 */ | ||
173 | |||
174 | #ifdef LV_HAVE_SSE | ||
175 | #include <xmmintrin.h> | ||
176 | |||
177 | 2 | static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, | |
178 | const lv_16sc_t* complexVector, | ||
179 | const float scalar, | ||
180 | unsigned int num_points) | ||
181 | { | ||
182 | 2 | unsigned int number = 0; | |
183 | 2 | const unsigned int quarterPoints = num_points / 4; | |
184 | |||
185 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
186 | 2 | float* magnitudeVectorPtr = magnitudeVector; | |
187 | |||
188 | 2 | const float iScalar = 1.0 / scalar; | |
189 | 2 | __m128 invScalar = _mm_set_ps1(iScalar); | |
190 | |||
191 | __m128 cplxValue1, cplxValue2, result, re, im; | ||
192 | |||
193 | __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; | ||
194 | |||
195 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
196 | 65534 | inputFloatBuffer[0] = (float)(complexVectorPtr[0]); | |
197 | 65534 | inputFloatBuffer[1] = (float)(complexVectorPtr[1]); | |
198 | 65534 | inputFloatBuffer[2] = (float)(complexVectorPtr[2]); | |
199 | 65534 | inputFloatBuffer[3] = (float)(complexVectorPtr[3]); | |
200 | |||
201 | 65534 | inputFloatBuffer[4] = (float)(complexVectorPtr[4]); | |
202 | 65534 | inputFloatBuffer[5] = (float)(complexVectorPtr[5]); | |
203 | 65534 | inputFloatBuffer[6] = (float)(complexVectorPtr[6]); | |
204 | 65534 | inputFloatBuffer[7] = (float)(complexVectorPtr[7]); | |
205 | |||
206 | 65534 | cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); | |
207 | 65534 | cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); | |
208 | |||
209 | 65534 | re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); | |
210 | 65534 | im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); | |
211 | |||
212 | 65534 | complexVectorPtr += 8; | |
213 | |||
214 | 65534 | cplxValue1 = _mm_mul_ps(re, invScalar); | |
215 | 65534 | cplxValue2 = _mm_mul_ps(im, invScalar); | |
216 | |||
217 | 65534 | cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values | |
218 | 65534 | cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values | |
219 | |||
220 | 65534 | result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values | |
221 | |||
222 | 65534 | result = _mm_sqrt_ps(result); // Square root the values | |
223 | |||
224 | _mm_store_ps(magnitudeVectorPtr, result); | ||
225 | |||
226 | 65534 | magnitudeVectorPtr += 4; | |
227 | } | ||
228 | |||
229 | 2 | number = quarterPoints * 4; | |
230 | 2 | magnitudeVectorPtr = &magnitudeVector[number]; | |
231 | 2 | complexVectorPtr = (const int16_t*)&complexVector[number]; | |
232 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
233 | 6 | float val1Real = (float)(*complexVectorPtr++) * iScalar; | |
234 | 6 | float val1Imag = (float)(*complexVectorPtr++) * iScalar; | |
235 | 6 | *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); | |
236 | } | ||
237 | 2 | } | |
238 | |||
239 | |||
240 | #endif /* LV_HAVE_SSE */ | ||
241 | |||
242 | #ifdef LV_HAVE_GENERIC | ||
243 | |||
244 | 2 | static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, | |
245 | const lv_16sc_t* complexVector, | ||
246 | const float scalar, | ||
247 | unsigned int num_points) | ||
248 | { | ||
249 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
250 | 2 | float* magnitudeVectorPtr = magnitudeVector; | |
251 | 2 | unsigned int number = 0; | |
252 | 2 | const float invScalar = 1.0 / scalar; | |
253 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
254 | 262142 | float real = ((float)(*complexVectorPtr++)) * invScalar; | |
255 | 262142 | float imag = ((float)(*complexVectorPtr++)) * invScalar; | |
256 | 262142 | *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); | |
257 | } | ||
258 | 2 | } | |
259 | #endif /* LV_HAVE_GENERIC */ | ||
260 | |||
261 | |||
262 | #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */ | ||
263 | |||
264 | #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H | ||
265 | #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H | ||
266 | |||
267 | #include <inttypes.h> | ||
268 | #include <math.h> | ||
269 | #include <stdio.h> | ||
270 | #include <volk/volk_common.h> | ||
271 | |||
272 | #ifdef LV_HAVE_AVX2 | ||
273 | #include <immintrin.h> | ||
274 | |||
275 | 2 | static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, | |
276 | const lv_16sc_t* complexVector, | ||
277 | const float scalar, | ||
278 | unsigned int num_points) | ||
279 | { | ||
280 | 2 | unsigned int number = 0; | |
281 | 2 | const unsigned int eighthPoints = num_points / 8; | |
282 | |||
283 | 2 | const int16_t* complexVectorPtr = (const int16_t*)complexVector; | |
284 | 2 | float* magnitudeVectorPtr = magnitudeVector; | |
285 | |||
286 | 4 | __m256 invScalar = _mm256_set1_ps(1.0 / scalar); | |
287 | |||
288 | __m256 cplxValue1, cplxValue2, result; | ||
289 | __m256i int1, int2; | ||
290 | __m128i short1, short2; | ||
291 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
292 | |||
293 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
294 | |||
295 | 32766 | int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); | |
296 | 32766 | complexVectorPtr += 16; | |
297 | 32766 | short1 = _mm256_extracti128_si256(int1, 0); | |
298 | 32766 | short2 = _mm256_extracti128_si256(int1, 1); | |
299 | |||
300 | 32766 | int1 = _mm256_cvtepi16_epi32(short1); | |
301 | 32766 | int2 = _mm256_cvtepi16_epi32(short2); | |
302 | 32766 | cplxValue1 = _mm256_cvtepi32_ps(int1); | |
303 | 32766 | cplxValue2 = _mm256_cvtepi32_ps(int2); | |
304 | |||
305 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); | |
306 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); | |
307 | |||
308 | 32766 | cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values | |
309 | 32766 | cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values | |
310 | |||
311 | 32766 | result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values | |
312 | 32766 | result = _mm256_permutevar8x32_ps(result, idx); | |
313 | |||
314 | 32766 | result = _mm256_sqrt_ps(result); // Square root the values | |
315 | |||
316 | _mm256_storeu_ps(magnitudeVectorPtr, result); | ||
317 | |||
318 | 32766 | magnitudeVectorPtr += 8; | |
319 | } | ||
320 | |||
321 | 2 | number = eighthPoints * 8; | |
322 | 2 | magnitudeVectorPtr = &magnitudeVector[number]; | |
323 | 2 | complexVectorPtr = (const int16_t*)&complexVector[number]; | |
324 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
325 | 14 | float val1Real = (float)(*complexVectorPtr++) / scalar; | |
326 | 14 | float val1Imag = (float)(*complexVectorPtr++) / scalar; | |
327 | 14 | *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); | |
328 | } | ||
329 | 2 | } | |
330 | #endif /* LV_HAVE_AVX2 */ | ||
331 | |||
332 | #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ | ||
333 |