GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 176 176 100.0%
Functions: 10 10 100.0%
Branches: 26 30 86.7%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_s32f_square_dist_scalar_mult_32f
12 *
13 * \b Overview
14 *
15 * Calculates the squared distance between a single complex input and each
16 * point in a complex vector, scaled by a scalar value.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0,
21 * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li src0: The complex input. Only the first point is used.
25 * \li points: A complex vector of reference points.
26 * \li scalar: A float to scale the distances by
27 * \li num_points: The number of data points.
28 *
29 * \b Outputs
30 * \li target: A vector of distances between src0 and the vector of points.
31 *
32 * \b Example
33 * Calculate the distance between an input and reference points in a square
34 * 16-qam constellation. Normalize distances by the area of the constellation.
35 * \code
36 * int N = 16;
37 * unsigned int alignment = volk_get_alignment();
38 * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
39 * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
40 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
41 * float const_vals[] = {-3, -1, 1, 3};
42 *
43 * unsigned int jj = 0;
44 * for(unsigned int ii = 0; ii < N; ++ii){
45 * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
46 * if((ii+1)%4 == 0) ++jj;
47 * }
48 *
49 * *rx = lv_cmake(0.5f, 2.f);
50 * float scale = 1.f/64.f; // 1 / constellation area
51 *
52 * volk_32fc_x2_s32f_square_dist_scalar_mult_32f(out, rx, constellation, scale, N);
53 *
54 * printf("Distance from each constellation point:\n");
55 * for(unsigned int ii = 0; ii < N; ++ii){
56 * printf("%.4f ", out[ii]);
57 * if((ii+1)%4 == 0) printf("\n");
58 * }
59 *
60 * volk_free(rx);
61 * volk_free(constellation);
62 * volk_free(out);
63 * \endcode
64 */
65
66 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
68
69 #include <volk/volk_complex.h>
70
71
72 18 static inline void calculate_scaled_distances(float* target,
73 const lv_32fc_t symbol,
74 const lv_32fc_t* points,
75 const float scalar,
76 const unsigned int num_points)
77 {
78 lv_32fc_t diff;
79
2/2
✓ Branch 0 taken 262190 times.
✓ Branch 1 taken 18 times.
262208 for (unsigned int i = 0; i < num_points; ++i) {
80 /*
81 * Calculate: |y - x|^2 * SNR_lin
82 * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
83 */
84 262190 diff = symbol - *points++;
85 262190 *target++ =
86 262190 scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
87 }
88 18 }
89
90
91 #ifdef LV_HAVE_AVX2
92 #include <immintrin.h>
93 #include <volk/volk_avx2_intrinsics.h>
94
95 static inline void
96 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
97 lv_32fc_t* src0,
98 lv_32fc_t* points,
99 float scalar,
100 unsigned int num_points)
101 {
102 2 const unsigned int num_bytes = num_points * 8;
103 __m128 xmm9, xmm10;
104 __m256 xmm4, xmm6;
105 __m256 xmm_points0, xmm_points1, xmm_result;
106
107 2 const unsigned int bound = num_bytes >> 6;
108
109 // load complex value into all parts of the register.
110 2 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
111 2 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
112
113 // Load scalar into all 8 parts of the register
114 2 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 2 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
116
117 // Set permutation constant
118 2 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
119
120
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned int i = 0; i < bound; ++i) {
121 32766 xmm_points0 = _mm256_load_ps((float*)points);
122 32766 xmm_points1 = _mm256_load_ps((float*)(points + 4));
123 32766 points += 8;
124 32766 __VOLK_PREFETCH(points);
125
126 32766 xmm_result = _mm256_scaled_norm_dist_ps_avx2(
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
128
129 _mm256_store_ps(target, xmm_result);
130 32766 target += 8;
131 }
132
133
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 5 & 1) {
134 2 xmm_points0 = _mm256_load_ps((float*)points);
135
136 2 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
137
138 2 points += 4;
139
140 2 xmm6 = _mm256_mul_ps(xmm4, xmm4);
141
142 2 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 2 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
144
145 2 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
146
147 2 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148 _mm_store_ps(target, xmm9);
149 2 target += 4;
150 }
151
152
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 4 & 1) {
153 2 xmm9 = _mm_load_ps((float*)points);
154
155 2 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
156
157 2 points += 2;
158
159 2 xmm9 = _mm_mul_ps(xmm10, xmm10);
160
161 2 xmm10 = _mm_hadd_ps(xmm9, xmm9);
162
163 2 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
164
165 _mm_storeh_pi((__m64*)target, xmm10);
166 2 target += 2;
167 }
168
169 2 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
170 2 }
171
172 #endif /*LV_HAVE_AVX2*/
173
174
175 #ifdef LV_HAVE_AVX
176 #include <immintrin.h>
177 #include <volk/volk_avx_intrinsics.h>
178
179 static inline void
180 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
181 lv_32fc_t* src0,
182 lv_32fc_t* points,
183 float scalar,
184 unsigned int num_points)
185 {
186 2 const int eightsPoints = num_points / 8;
187 2 const int remainder = num_points - 8 * eightsPoints;
188
189 __m256 xmm_points0, xmm_points1, xmm_result;
190
191 // load complex value into all parts of the register.
192 4 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
193
194 // Load scalar into all 8 parts of the register
195 2 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
196
197
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (int i = 0; i < eightsPoints; ++i) {
198 32766 xmm_points0 = _mm256_load_ps((float*)points);
199 32766 xmm_points1 = _mm256_load_ps((float*)(points + 4));
200 32766 points += 8;
201
202 32766 xmm_result = _mm256_scaled_norm_dist_ps(
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
204
205 _mm256_store_ps(target, xmm_result);
206 32766 target += 8;
207 }
208
209 2 const lv_32fc_t symbol = *src0;
210 2 calculate_scaled_distances(target, symbol, points, scalar, remainder);
211 2 }
212
213 #endif /* LV_HAVE_AVX */
214
215
216 #ifdef LV_HAVE_SSE3
217 #include <pmmintrin.h>
218 #include <volk/volk_sse3_intrinsics.h>
219
220 static inline void
221 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
222 lv_32fc_t* src0,
223 lv_32fc_t* points,
224 float scalar,
225 unsigned int num_points)
226 {
227 __m128 xmm_points0, xmm_points1, xmm_result;
228
229 /*
230 * First do 4 values in every loop iteration.
231 * There may be up to 3 values left.
232 * leftovers0 indicates if at least 2 more are available for SSE execution.
233 * leftovers1 indicates if there is a single element left.
234 */
235 2 const int quarterPoints = num_points / 4;
236 2 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 2 const int leftovers1 = num_points % 2;
238
239 // load complex value into both parts of the register.
240 4 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
241
242 // Load scalar into all 4 parts of the register
243 2 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
244
245
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (int i = 0; i < quarterPoints; ++i) {
246 65534 xmm_points0 = _mm_load_ps((float*)points);
247 65534 xmm_points1 = _mm_load_ps((float*)(points + 2));
248 65534 points += 4;
249 65534 __VOLK_PREFETCH(points);
250 // calculate distances
251 65534 xmm_result = _mm_scaled_norm_dist_ps_sse3(
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
253
254 _mm_store_ps(target, xmm_result);
255 65534 target += 4;
256 }
257
258
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (int i = 0; i < leftovers0; ++i) {
259 2 xmm_points0 = _mm_load_ps((float*)points);
260 2 points += 2;
261
262 2 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263 2 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264 2 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265 2 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
266
267 _mm_storeh_pi((__m64*)target, xmm_result);
268 2 target += 2;
269 }
270
271 2 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
272 2 }
273
274 #endif /*LV_HAVE_SSE3*/
275
276 #ifdef LV_HAVE_SSE
277 #include <volk/volk_sse_intrinsics.h>
278 #include <xmmintrin.h>
279 static inline void
280 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
281 lv_32fc_t* src0,
282 lv_32fc_t* points,
283 float scalar,
284 unsigned int num_points)
285 {
286 2 const __m128 xmm_scalar = _mm_set1_ps(scalar);
287 2 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
288
289
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned i = 0; i < num_points / 4; ++i) {
290 65534 __m128 xmm_points0 = _mm_load_ps((float*)points);
291 65534 __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
292 65534 points += 4;
293 65534 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295 _mm_store_ps((float*)target, xmm_result);
296 65534 target += 4;
297 }
298
299 2 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
300 2 }
301 #endif // LV_HAVE_SSE
302
303 #ifdef LV_HAVE_GENERIC
304 static inline void
305 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
306 lv_32fc_t* src0,
307 lv_32fc_t* points,
308 float scalar,
309 unsigned int num_points)
310 {
311 2 const lv_32fc_t symbol = *src0;
312 2 calculate_scaled_distances(target, symbol, points, scalar, num_points);
313 2 }
314
315 #endif /*LV_HAVE_GENERIC*/
316
317
318 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
319
320 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
321 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
322
323 #include <volk/volk_complex.h>
324
325
326 #ifdef LV_HAVE_AVX2
327 #include <immintrin.h>
328 #include <volk/volk_avx2_intrinsics.h>
329
330 static inline void
331 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
332 lv_32fc_t* src0,
333 lv_32fc_t* points,
334 float scalar,
335 unsigned int num_points)
336 {
337 2 const unsigned int num_bytes = num_points * 8;
338 __m128 xmm9, xmm10;
339 __m256 xmm4, xmm6;
340 __m256 xmm_points0, xmm_points1, xmm_result;
341
342 2 const unsigned int bound = num_bytes >> 6;
343
344 // load complex value into all parts of the register.
345 2 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
346 2 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
347
348 // Load scalar into all 8 parts of the register
349 2 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
350 2 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
351
352 // Set permutation constant
353 2 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
354
355
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (unsigned int i = 0; i < bound; ++i) {
356 32766 xmm_points0 = _mm256_loadu_ps((float*)points);
357 32766 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
358 32766 points += 8;
359 32766 __VOLK_PREFETCH(points);
360
361 32766 xmm_result = _mm256_scaled_norm_dist_ps_avx2(
362 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
363
364 _mm256_storeu_ps(target, xmm_result);
365 32766 target += 8;
366 }
367
368
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 5 & 1) {
369 2 xmm_points0 = _mm256_loadu_ps((float*)points);
370
371 2 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
372
373 2 points += 4;
374
375 2 xmm6 = _mm256_mul_ps(xmm4, xmm4);
376
377 2 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
378 2 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
379
380 2 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
381
382 2 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
383 _mm_storeu_ps(target, xmm9);
384 2 target += 4;
385 }
386
387
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 4 & 1) {
388 2 xmm9 = _mm_loadu_ps((float*)points);
389
390 2 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
391
392 2 points += 2;
393
394 2 xmm9 = _mm_mul_ps(xmm10, xmm10);
395
396 2 xmm10 = _mm_hadd_ps(xmm9, xmm9);
397
398 2 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
399
400 _mm_storeh_pi((__m64*)target, xmm10);
401 2 target += 2;
402 }
403
404 2 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
405 2 }
406
407 #endif /*LV_HAVE_AVX2*/
408
409
410 #ifdef LV_HAVE_AVX
411 #include <immintrin.h>
412 #include <volk/volk_avx_intrinsics.h>
413
414 static inline void
415 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
416 lv_32fc_t* src0,
417 lv_32fc_t* points,
418 float scalar,
419 unsigned int num_points)
420 {
421 2 const int eightsPoints = num_points / 8;
422 2 const int remainder = num_points - 8 * eightsPoints;
423
424 __m256 xmm_points0, xmm_points1, xmm_result;
425
426 // load complex value into all parts of the register.
427 4 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
428
429 // Load scalar into all 8 parts of the register
430 2 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
431
432
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (int i = 0; i < eightsPoints; ++i) {
433 32766 xmm_points0 = _mm256_loadu_ps((float*)points);
434 32766 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
435 32766 points += 8;
436
437 32766 xmm_result = _mm256_scaled_norm_dist_ps(
438 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
439
440 _mm256_storeu_ps(target, xmm_result);
441 32766 target += 8;
442 }
443
444 2 const lv_32fc_t symbol = *src0;
445 2 calculate_scaled_distances(target, symbol, points, scalar, remainder);
446 2 }
447
448 #endif /* LV_HAVE_AVX */
449
450
451 #ifdef LV_HAVE_SSE3
452 #include <pmmintrin.h>
453 #include <volk/volk_sse3_intrinsics.h>
454
455 static inline void
456 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
457 lv_32fc_t* src0,
458 lv_32fc_t* points,
459 float scalar,
460 unsigned int num_points)
461 {
462 __m128 xmm_points0, xmm_points1, xmm_result;
463
464 /*
465 * First do 4 values in every loop iteration.
466 * There may be up to 3 values left.
467 * leftovers0 indicates if at least 2 more are available for SSE execution.
468 * leftovers1 indicates if there is a single element left.
469 */
470 2 const int quarterPoints = num_points / 4;
471 2 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
472 2 const int leftovers1 = num_points % 2;
473
474 // load complex value into both parts of the register.
475 4 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
476
477 // Load scalar into all 4 parts of the register
478 2 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
479
480
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (int i = 0; i < quarterPoints; ++i) {
481 65534 xmm_points0 = _mm_loadu_ps((float*)points);
482 65534 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
483 65534 points += 4;
484 65534 __VOLK_PREFETCH(points);
485 // calculate distances
486 65534 xmm_result = _mm_scaled_norm_dist_ps_sse3(
487 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
488
489 _mm_storeu_ps(target, xmm_result);
490 65534 target += 4;
491 }
492
493
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (int i = 0; i < leftovers0; ++i) {
494 2 xmm_points0 = _mm_loadu_ps((float*)points);
495 2 points += 2;
496
497 2 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
498 2 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
499 2 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
500 2 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
501
502 _mm_storeh_pi((__m64*)target, xmm_result);
503 2 target += 2;
504 }
505
506 2 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
507 2 }
508
509 #endif /*LV_HAVE_SSE3*/
510
511 #ifdef LV_HAVE_SSE
512 #include <volk/volk_sse_intrinsics.h>
513 #include <xmmintrin.h>
514 static inline void
515 2 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
516 lv_32fc_t* src0,
517 lv_32fc_t* points,
518 float scalar,
519 unsigned int num_points)
520 {
521 2 const __m128 xmm_scalar = _mm_set1_ps(scalar);
522 2 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
523
524
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (unsigned i = 0; i < num_points / 4; ++i) {
525 65534 __m128 xmm_points0 = _mm_loadu_ps((float*)points);
526 65534 __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
527 65534 points += 4;
528 65534 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
529 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
530 _mm_storeu_ps((float*)target, xmm_result);
531 65534 target += 4;
532 }
533
534 2 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
535 2 }
536 #endif // LV_HAVE_SSE
537
538 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
539