Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014, 2019 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_s32f_square_dist_scalar_mult_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Calculates the squared distance between a single complex input and each | |
16 | * point in a complex vector, scaled by a scalar value. | |
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, | ||
21 | * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li src0: The complex input. Only the first point is used. | ||
25 | * \li points: A complex vector of reference points. | ||
26 | * \li scalar: A float to scale the distances by | ||
27 | * \li num_points: The number of data points. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li target: A vector of scaled squared distances between src0 and each of the points. | |
31 | * | ||
32 | * \b Example | ||
33 | * Calculate the distance between an input and reference points in a square | ||
34 | * 16-qam constellation. Normalize distances by the area of the constellation. | ||
35 | * \code | ||
36 | * int N = 16; | ||
37 | * unsigned int alignment = volk_get_alignment(); | ||
38 | * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
39 | * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
40 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
41 | * float const_vals[] = {-3, -1, 1, 3}; | ||
42 | * | ||
43 | * unsigned int jj = 0; | ||
44 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
45 | * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]); | ||
46 | * if((ii+1)%4 == 0) ++jj; | ||
47 | * } | ||
48 | * | ||
49 | * *rx = lv_cmake(0.5f, 2.f); | ||
50 | * float scale = 1.f/64.f; // 1 / constellation area | ||
51 | * | ||
52 | * volk_32fc_x2_s32f_square_dist_scalar_mult_32f(out, rx, constellation, scale, N); | ||
53 | * | ||
54 | * printf("Distance from each constellation point:\n"); | ||
55 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
56 | * printf("%.4f ", out[ii]); | ||
57 | * if((ii+1)%4 == 0) printf("\n"); | ||
58 | * } | ||
59 | * | ||
60 | * volk_free(rx); | ||
61 | * volk_free(constellation); | ||
62 | * volk_free(out); | ||
63 | * \endcode | ||
64 | */ | ||
65 | |||
66 | #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H | ||
67 | #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H | ||
68 | |||
69 | #include <volk/volk_complex.h> | ||
70 | |||
71 | |||
72 | 18 | static inline void calculate_scaled_distances(float* target, | |
73 | const lv_32fc_t symbol, | ||
74 | const lv_32fc_t* points, | ||
75 | const float scalar, | ||
76 | const unsigned int num_points) | ||
77 | { | ||
78 | lv_32fc_t diff; | ||
79 |
2/2✓ Branch 0 taken 262190 times.
✓ Branch 1 taken 18 times.
|
262208 | for (unsigned int i = 0; i < num_points; ++i) { |
80 | /* | ||
81 | * Calculate: |y - x|^2 * SNR_lin | ||
82 | * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); | ||
83 | */ | ||
84 | 262190 | diff = symbol - *points++; | |
85 | 262190 | *target++ = | |
86 | 262190 | scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); | |
87 | } | ||
88 | 18 | } | |
89 | |||
90 | |||
91 | #ifdef LV_HAVE_AVX2 | ||
92 | #include <immintrin.h> | ||
93 | #include <volk/volk_avx2_intrinsics.h> | ||
94 | |||
95 | static inline void | ||
96 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, | |
97 | lv_32fc_t* src0, | ||
98 | lv_32fc_t* points, | ||
99 | float scalar, | ||
100 | unsigned int num_points) | ||
101 | { | ||
102 | 2 | const unsigned int num_bytes = num_points * 8; | |
103 | __m128 xmm9, xmm10; | ||
104 | __m256 xmm4, xmm6; | ||
105 | __m256 xmm_points0, xmm_points1, xmm_result; | ||
106 | |||
107 | 2 | const unsigned int bound = num_bytes >> 6; | |
108 | |||
109 | // load complex value into all parts of the register. | ||
110 | 2 | const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); | |
111 | 2 | const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); | |
112 | |||
113 | // Load scalar into all 8 parts of the register | ||
114 | 2 | const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); | |
115 | 2 | const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); | |
116 | |||
117 | // Set permutation constant | ||
118 | 2 | const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
119 | |||
120 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (unsigned int i = 0; i < bound; ++i) { |
121 | 32766 | xmm_points0 = _mm256_load_ps((float*)points); | |
122 | 32766 | xmm_points1 = _mm256_load_ps((float*)(points + 4)); | |
123 | 32766 | points += 8; | |
124 | 32766 | __VOLK_PREFETCH(points); | |
125 | |||
126 | 32766 | xmm_result = _mm256_scaled_norm_dist_ps_avx2( | |
127 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
128 | |||
129 | _mm256_store_ps(target, xmm_result); | ||
130 | 32766 | target += 8; | |
131 | } | ||
132 | |||
133 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 5 & 1) { |
134 | 2 | xmm_points0 = _mm256_load_ps((float*)points); | |
135 | |||
136 | 2 | xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); | |
137 | |||
138 | 2 | points += 4; | |
139 | |||
140 | 2 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
141 | |||
142 | 2 | xmm4 = _mm256_hadd_ps(xmm6, xmm6); | |
143 | 2 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
144 | |||
145 | 2 | xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); | |
146 | |||
147 | 2 | xmm9 = _mm256_extractf128_ps(xmm_result, 1); | |
148 | _mm_store_ps(target, xmm9); | ||
149 | 2 | target += 4; | |
150 | } | ||
151 | |||
152 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 4 & 1) { |
153 | 2 | xmm9 = _mm_load_ps((float*)points); | |
154 | |||
155 | 2 | xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); | |
156 | |||
157 | 2 | points += 2; | |
158 | |||
159 | 2 | xmm9 = _mm_mul_ps(xmm10, xmm10); | |
160 | |||
161 | 2 | xmm10 = _mm_hadd_ps(xmm9, xmm9); | |
162 | |||
163 | 2 | xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); | |
164 | |||
165 | _mm_storeh_pi((__m64*)target, xmm10); | ||
166 | 2 | target += 2; | |
167 | } | ||
168 | |||
169 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); | |
170 | 2 | } | |
171 | |||
172 | #endif /*LV_HAVE_AVX2*/ | ||
173 | |||
174 | |||
175 | #ifdef LV_HAVE_AVX | ||
176 | #include <immintrin.h> | ||
177 | #include <volk/volk_avx_intrinsics.h> | ||
178 | |||
179 | static inline void | ||
180 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, | |
181 | lv_32fc_t* src0, | ||
182 | lv_32fc_t* points, | ||
183 | float scalar, | ||
184 | unsigned int num_points) | ||
185 | { | ||
186 | 2 | const int eightsPoints = num_points / 8; | |
187 | 2 | const int remainder = num_points - 8 * eightsPoints; | |
188 | |||
189 | __m256 xmm_points0, xmm_points1, xmm_result; | ||
190 | |||
191 | // load complex value into all parts of the register. | ||
192 | 4 | const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); | |
193 | |||
194 | // Load scalar into all 8 parts of the register | ||
195 | 2 | const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); | |
196 | |||
197 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (int i = 0; i < eightsPoints; ++i) { |
198 | 32766 | xmm_points0 = _mm256_load_ps((float*)points); | |
199 | 32766 | xmm_points1 = _mm256_load_ps((float*)(points + 4)); | |
200 | 32766 | points += 8; | |
201 | |||
202 | 32766 | xmm_result = _mm256_scaled_norm_dist_ps( | |
203 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
204 | |||
205 | _mm256_store_ps(target, xmm_result); | ||
206 | 32766 | target += 8; | |
207 | } | ||
208 | |||
209 | 2 | const lv_32fc_t symbol = *src0; | |
210 | 2 | calculate_scaled_distances(target, symbol, points, scalar, remainder); | |
211 | 2 | } | |
212 | |||
213 | #endif /* LV_HAVE_AVX */ | ||
214 | |||
215 | |||
216 | #ifdef LV_HAVE_SSE3 | ||
217 | #include <pmmintrin.h> | ||
218 | #include <volk/volk_sse3_intrinsics.h> | ||
219 | |||
220 | static inline void | ||
221 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, | |
222 | lv_32fc_t* src0, | ||
223 | lv_32fc_t* points, | ||
224 | float scalar, | ||
225 | unsigned int num_points) | ||
226 | { | ||
227 | __m128 xmm_points0, xmm_points1, xmm_result; | ||
228 | |||
229 | /* | ||
230 | * First do 4 values in every loop iteration. | ||
231 | * There may be up to 3 values left. | ||
232 | * leftovers0 indicates if at least 2 more are available for SSE execution. | ||
233 | * leftovers1 indicates if there is a single element left. | ||
234 | */ | ||
235 | 2 | const int quarterPoints = num_points / 4; | |
236 | 2 | const int leftovers0 = (num_points / 2) - 2 * quarterPoints; | |
237 | 2 | const int leftovers1 = num_points % 2; | |
238 | |||
239 | // load complex value into both parts of the register. | ||
240 | 4 | const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); | |
241 | |||
242 | // Load scalar into all 4 parts of the register | ||
243 | 2 | const __m128 xmm_scalar = _mm_load1_ps(&scalar); | |
244 | |||
245 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (int i = 0; i < quarterPoints; ++i) { |
246 | 65534 | xmm_points0 = _mm_load_ps((float*)points); | |
247 | 65534 | xmm_points1 = _mm_load_ps((float*)(points + 2)); | |
248 | 65534 | points += 4; | |
249 | 65534 | __VOLK_PREFETCH(points); | |
250 | // calculate distances | ||
251 | 65534 | xmm_result = _mm_scaled_norm_dist_ps_sse3( | |
252 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
253 | |||
254 | _mm_store_ps(target, xmm_result); | ||
255 | 65534 | target += 4; | |
256 | } | ||
257 | |||
258 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (int i = 0; i < leftovers0; ++i) { |
259 | 2 | xmm_points0 = _mm_load_ps((float*)points); | |
260 | 2 | points += 2; | |
261 | |||
262 | 2 | xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); | |
263 | 2 | xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); | |
264 | 2 | xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); | |
265 | 2 | xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); | |
266 | |||
267 | _mm_storeh_pi((__m64*)target, xmm_result); | ||
268 | 2 | target += 2; | |
269 | } | ||
270 | |||
271 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); | |
272 | 2 | } | |
273 | |||
274 | #endif /*LV_HAVE_SSE3*/ | ||
275 | |||
276 | #ifdef LV_HAVE_SSE | ||
277 | #include <volk/volk_sse_intrinsics.h> | ||
278 | #include <xmmintrin.h> | ||
279 | static inline void | ||
280 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, | |
281 | lv_32fc_t* src0, | ||
282 | lv_32fc_t* points, | ||
283 | float scalar, | ||
284 | unsigned int num_points) | ||
285 | { | ||
286 | 2 | const __m128 xmm_scalar = _mm_set1_ps(scalar); | |
287 | 2 | const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); | |
288 | |||
289 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (unsigned i = 0; i < num_points / 4; ++i) { |
290 | 65534 | __m128 xmm_points0 = _mm_load_ps((float*)points); | |
291 | 65534 | __m128 xmm_points1 = _mm_load_ps((float*)(points + 2)); | |
292 | 65534 | points += 4; | |
293 | 65534 | __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( | |
294 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
295 | _mm_store_ps((float*)target, xmm_result); | ||
296 | 65534 | target += 4; | |
297 | } | ||
298 | |||
299 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); | |
300 | 2 | } | |
301 | #endif // LV_HAVE_SSE | ||
302 | |||
303 | #ifdef LV_HAVE_GENERIC | ||
304 | static inline void | ||
305 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, | |
306 | lv_32fc_t* src0, | ||
307 | lv_32fc_t* points, | ||
308 | float scalar, | ||
309 | unsigned int num_points) | ||
310 | { | ||
311 | 2 | const lv_32fc_t symbol = *src0; | |
312 | 2 | calculate_scaled_distances(target, symbol, points, scalar, num_points); | |
313 | 2 | } | |
314 | |||
315 | #endif /*LV_HAVE_GENERIC*/ | ||
316 | |||
317 | |||
318 | #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/ | ||
319 | |||
320 | #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H | ||
321 | #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H | ||
322 | |||
323 | #include <volk/volk_complex.h> | ||
324 | |||
325 | |||
326 | #ifdef LV_HAVE_AVX2 | ||
327 | #include <immintrin.h> | ||
328 | #include <volk/volk_avx2_intrinsics.h> | ||
329 | |||
330 | static inline void | ||
331 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, | |
332 | lv_32fc_t* src0, | ||
333 | lv_32fc_t* points, | ||
334 | float scalar, | ||
335 | unsigned int num_points) | ||
336 | { | ||
337 | 2 | const unsigned int num_bytes = num_points * 8; | |
338 | __m128 xmm9, xmm10; | ||
339 | __m256 xmm4, xmm6; | ||
340 | __m256 xmm_points0, xmm_points1, xmm_result; | ||
341 | |||
342 | 2 | const unsigned int bound = num_bytes >> 6; | |
343 | |||
344 | // load complex value into all parts of the register. | ||
345 | 2 | const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); | |
346 | 2 | const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); | |
347 | |||
348 | // Load scalar into all 8 parts of the register | ||
349 | 2 | const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); | |
350 | 2 | const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); | |
351 | |||
352 | // Set permutation constant | ||
353 | 2 | const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
354 | |||
355 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (unsigned int i = 0; i < bound; ++i) { |
356 | 32766 | xmm_points0 = _mm256_loadu_ps((float*)points); | |
357 | 32766 | xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); | |
358 | 32766 | points += 8; | |
359 | 32766 | __VOLK_PREFETCH(points); | |
360 | |||
361 | 32766 | xmm_result = _mm256_scaled_norm_dist_ps_avx2( | |
362 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
363 | |||
364 | _mm256_storeu_ps(target, xmm_result); | ||
365 | 32766 | target += 8; | |
366 | } | ||
367 | |||
368 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 5 & 1) { |
369 | 2 | xmm_points0 = _mm256_loadu_ps((float*)points); | |
370 | |||
371 | 2 | xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); | |
372 | |||
373 | 2 | points += 4; | |
374 | |||
375 | 2 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
376 | |||
377 | 2 | xmm4 = _mm256_hadd_ps(xmm6, xmm6); | |
378 | 2 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
379 | |||
380 | 2 | xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); | |
381 | |||
382 | 2 | xmm9 = _mm256_extractf128_ps(xmm_result, 1); | |
383 | _mm_storeu_ps(target, xmm9); | ||
384 | 2 | target += 4; | |
385 | } | ||
386 | |||
387 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 4 & 1) { |
388 | 2 | xmm9 = _mm_loadu_ps((float*)points); | |
389 | |||
390 | 2 | xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); | |
391 | |||
392 | 2 | points += 2; | |
393 | |||
394 | 2 | xmm9 = _mm_mul_ps(xmm10, xmm10); | |
395 | |||
396 | 2 | xmm10 = _mm_hadd_ps(xmm9, xmm9); | |
397 | |||
398 | 2 | xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); | |
399 | |||
400 | _mm_storeh_pi((__m64*)target, xmm10); | ||
401 | 2 | target += 2; | |
402 | } | ||
403 | |||
404 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); | |
405 | 2 | } | |
406 | |||
407 | #endif /*LV_HAVE_AVX2*/ | ||
408 | |||
409 | |||
410 | #ifdef LV_HAVE_AVX | ||
411 | #include <immintrin.h> | ||
412 | #include <volk/volk_avx_intrinsics.h> | ||
413 | |||
414 | static inline void | ||
415 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, | |
416 | lv_32fc_t* src0, | ||
417 | lv_32fc_t* points, | ||
418 | float scalar, | ||
419 | unsigned int num_points) | ||
420 | { | ||
421 | 2 | const int eightsPoints = num_points / 8; | |
422 | 2 | const int remainder = num_points - 8 * eightsPoints; | |
423 | |||
424 | __m256 xmm_points0, xmm_points1, xmm_result; | ||
425 | |||
426 | // load complex value into all parts of the register. | ||
427 | 4 | const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); | |
428 | |||
429 | // Load scalar into all 8 parts of the register | ||
430 | 2 | const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); | |
431 | |||
432 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (int i = 0; i < eightsPoints; ++i) { |
433 | 32766 | xmm_points0 = _mm256_loadu_ps((float*)points); | |
434 | 32766 | xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); | |
435 | 32766 | points += 8; | |
436 | |||
437 | 32766 | xmm_result = _mm256_scaled_norm_dist_ps( | |
438 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
439 | |||
440 | _mm256_storeu_ps(target, xmm_result); | ||
441 | 32766 | target += 8; | |
442 | } | ||
443 | |||
444 | 2 | const lv_32fc_t symbol = *src0; | |
445 | 2 | calculate_scaled_distances(target, symbol, points, scalar, remainder); | |
446 | 2 | } | |
447 | |||
448 | #endif /* LV_HAVE_AVX */ | ||
449 | |||
450 | |||
451 | #ifdef LV_HAVE_SSE3 | ||
452 | #include <pmmintrin.h> | ||
453 | #include <volk/volk_sse3_intrinsics.h> | ||
454 | |||
455 | static inline void | ||
456 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, | |
457 | lv_32fc_t* src0, | ||
458 | lv_32fc_t* points, | ||
459 | float scalar, | ||
460 | unsigned int num_points) | ||
461 | { | ||
462 | __m128 xmm_points0, xmm_points1, xmm_result; | ||
463 | |||
464 | /* | ||
465 | * First do 4 values in every loop iteration. | ||
466 | * There may be up to 3 values left. | ||
467 | * leftovers0 indicates if at least 2 more are available for SSE execution. | ||
468 | * leftovers1 indicates if there is a single element left. | ||
469 | */ | ||
470 | 2 | const int quarterPoints = num_points / 4; | |
471 | 2 | const int leftovers0 = (num_points / 2) - 2 * quarterPoints; | |
472 | 2 | const int leftovers1 = num_points % 2; | |
473 | |||
474 | // load complex value into both parts of the register. | ||
475 | 4 | const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); | |
476 | |||
477 | // Load scalar into all 4 parts of the register | ||
478 | 2 | const __m128 xmm_scalar = _mm_load1_ps(&scalar); | |
479 | |||
480 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (int i = 0; i < quarterPoints; ++i) { |
481 | 65534 | xmm_points0 = _mm_loadu_ps((float*)points); | |
482 | 65534 | xmm_points1 = _mm_loadu_ps((float*)(points + 2)); | |
483 | 65534 | points += 4; | |
484 | 65534 | __VOLK_PREFETCH(points); | |
485 | // calculate distances | ||
486 | 65534 | xmm_result = _mm_scaled_norm_dist_ps_sse3( | |
487 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
488 | |||
489 | _mm_storeu_ps(target, xmm_result); | ||
490 | 65534 | target += 4; | |
491 | } | ||
492 | |||
493 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (int i = 0; i < leftovers0; ++i) { |
494 | 2 | xmm_points0 = _mm_loadu_ps((float*)points); | |
495 | 2 | points += 2; | |
496 | |||
497 | 2 | xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); | |
498 | 2 | xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); | |
499 | 2 | xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); | |
500 | 2 | xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); | |
501 | |||
502 | _mm_storeh_pi((__m64*)target, xmm_result); | ||
503 | 2 | target += 2; | |
504 | } | ||
505 | |||
506 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); | |
507 | 2 | } | |
508 | |||
509 | #endif /*LV_HAVE_SSE3*/ | ||
510 | |||
511 | #ifdef LV_HAVE_SSE | ||
512 | #include <volk/volk_sse_intrinsics.h> | ||
513 | #include <xmmintrin.h> | ||
514 | static inline void | ||
515 | 2 | volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, | |
516 | lv_32fc_t* src0, | ||
517 | lv_32fc_t* points, | ||
518 | float scalar, | ||
519 | unsigned int num_points) | ||
520 | { | ||
521 | 2 | const __m128 xmm_scalar = _mm_set1_ps(scalar); | |
522 | 2 | const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); | |
523 | |||
524 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (unsigned i = 0; i < num_points / 4; ++i) { |
525 | 65534 | __m128 xmm_points0 = _mm_loadu_ps((float*)points); | |
526 | 65534 | __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2)); | |
527 | 65534 | points += 4; | |
528 | 65534 | __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( | |
529 | xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); | ||
530 | _mm_storeu_ps((float*)target, xmm_result); | ||
531 | 65534 | target += 4; | |
532 | } | ||
533 | |||
534 | 2 | calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); | |
535 | 2 | } | |
536 | #endif // LV_HAVE_SSE | ||
537 | |||
538 | #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/ | ||
539 |