GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_x2_square_dist_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 120 120 100.0%
Functions: 4 4 100.0%
Branches: 19 22 86.4%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_x2_square_dist_32f
12 *
13 * \b Overview
14 *
15 * Calculates the squared distance between a single complex input and each
16 * point in a complex vector.
17 *
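* For each point p the kernel computes the squared Euclidean distance
* \f$|s - p|^2 = (\Re(s)-\Re(p))^2 + (\Im(s)-\Im(p))^2\f$ with s = src0[0];
* no square root is taken.
*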
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points,
21 * unsigned int num_points) \endcode
22 *
23 * \b Inputs
24 * \li src0: The complex input. Only the first point is used.
25 * \li points: A complex vector of reference points.
26 * \li num_points: The number of data points.
27 *
28 * \b Outputs
29 * \li target: A vector of squared distances between src0 and each point in points.
30 *
31 * \b Example
32 * Calculate the squared distance between an input and the reference points
33 * of a square 16-QAM constellation.
34 * \code
35 * int N = 16;
36 * unsigned int alignment = volk_get_alignment();
37 * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
38 * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*1, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40 * float const_vals[] = {-3, -1, 1, 3};
41 *
42 * // Generate 16-QAM constellation points
43 * unsigned int jj = 0;
44 * for(unsigned int ii = 0; ii < N; ++ii){
45 * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
46 * if((ii+1)%4 == 0) ++jj;
47 * }
48 *
49 * *rx = lv_cmake(0.5f, 2.f);
50 *
51 * volk_32fc_x2_square_dist_32f(out, rx, constellation, N);
52 *
53 * printf("Squared distance from each constellation point:\n");
54 * for(unsigned int ii = 0; ii < N; ++ii){
55 * printf("%.4f ", out[ii]);
56 * if((ii+1)%4 == 0) printf("\n");
57 * }
58 *
59 * volk_free(rx);
60 * volk_free(constellation);
61 * volk_free(out);
62 * \endcode
63 */
64
65 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
66 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
67
68 #include <inttypes.h>
69 #include <stdio.h>
70 #include <volk/volk_complex.h>
71
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
74
75 2 static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
76 lv_32fc_t* src0,
77 lv_32fc_t* points,
78 unsigned int num_points)
79 {
80 2 const unsigned int num_bytes = num_points * 8;
81 __m128 xmm0, xmm9, xmm10;
82 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
83
84 lv_32fc_t diff;
85 float sq_dist;
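// Each lv_32fc_t is 8 bytes (two 32-bit floats), so num_bytes >> 6 counts
// blocks of 8 complex points per AVX2 iteration; leftovers0, leftovers1 and
// leftovers2 flag a remaining tail of 4, 2 and 1 points respectively.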
86 2 int bound = num_bytes >> 6;
87 2 int leftovers0 = (num_bytes >> 5) & 1;
88 2 int leftovers1 = (num_bytes >> 4) & 1;
89 2 int leftovers2 = (num_bytes >> 3) & 1;
90 2 int i = 0;
91
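// Broadcast the single reference point src0[0] into all four re,im pairs of
// xmm1. _mm256_hadd_ps below adds adjacent pairs only within each 128-bit
// lane, leaving the eight results lane-interleaved; permuting with idx
// restores sequential order.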
92 2 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
93 2 xmm1 = _mm256_setzero_ps();
94 2 xmm0 = _mm_load_ps((float*)src0);
95 2 xmm0 = _mm_permute_ps(xmm0, 0b01000100);
96 2 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
97 2 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
98
99
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; i < bound; ++i) {
100 32766 xmm2 = _mm256_load_ps((float*)&points[0]);
101 32766 xmm3 = _mm256_load_ps((float*)&points[4]);
102 32766 points += 8;
103
104 32766 xmm4 = _mm256_sub_ps(xmm1, xmm2);
105 32766 xmm5 = _mm256_sub_ps(xmm1, xmm3);
106 32766 xmm6 = _mm256_mul_ps(xmm4, xmm4);
107 32766 xmm7 = _mm256_mul_ps(xmm5, xmm5);
108
109 32766 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
110 32766 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
111
112 _mm256_store_ps(target, xmm4);
113
114 32766 target += 8;
115 }
116
117
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (i = 0; i < leftovers0; ++i) {
118
119 2 xmm2 = _mm256_load_ps((float*)&points[0]);
120
121 2 xmm4 = _mm256_sub_ps(xmm1, xmm2);
122
123 2 points += 4;
124
125 2 xmm6 = _mm256_mul_ps(xmm4, xmm4);
126
127 2 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
128 2 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
129
130 2 xmm9 = _mm256_extractf128_ps(xmm4, 1);
131 _mm_store_ps(target, xmm9);
132
133 2 target += 4;
134 }
135
136
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (i = 0; i < leftovers1; ++i) {
137 2 xmm9 = _mm_load_ps((float*)&points[0]);
138
139 2 xmm10 = _mm_sub_ps(xmm0, xmm9);
140
141 2 points += 2;
142
143 2 xmm9 = _mm_mul_ps(xmm10, xmm10);
144
145 2 xmm10 = _mm_hadd_ps(xmm9, xmm9);
146
147 _mm_storeh_pi((__m64*)target, xmm10);
148
149 2 target += 2;
150 }
151
152
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 for (i = 0; i < leftovers2; ++i) {
153
154 2 diff = src0[0] - points[0];
155
156 2 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
157
158 2 target[0] = sq_dist;
159 }
160 2 }
161
162 #endif /*LV_HAVE_AVX2*/
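
The lane-interleaved output of _mm256_hadd_ps is the least obvious step in the
kernels above. The following minimal sketch (a hypothetical standalone test,
not part of VOLK; compile with -mavx2) prints the raw hadd result and then the
sequential order restored by the same permute index the kernel uses.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    float a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    float b[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    float out[8];

    /* hadd sums adjacent pairs, but only within each 128-bit lane. */
    __m256 h = _mm256_hadd_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
    _mm256_storeu_ps(out, h);
    for (int i = 0; i < 8; ++i)
        printf("%g ", out[i]); /* prints: 1 5 21 25 9 13 29 33 */
    printf("\n");

    /* The kernel's index vector moves the pair sums back into order. */
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    _mm256_storeu_ps(out, _mm256_permutevar8x32_ps(h, idx));
    for (int i = 0; i < 8; ++i)
        printf("%g ", out[i]); /* prints: 1 5 9 13 21 25 29 33 */
    printf("\n");
    return 0;
}
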
163
164 #ifdef LV_HAVE_SSE3
165 #include <pmmintrin.h>
166 #include <xmmintrin.h>
167
168 2 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
169 lv_32fc_t* src0,
170 lv_32fc_t* points,
171 unsigned int num_points)
172 {
173 2 const unsigned int num_bytes = num_points * 8;
174
175 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
176
177 lv_32fc_t diff;
178 float sq_dist;
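// num_bytes >> 5 counts blocks of 4 complex points (two 128-bit loads per
// iteration); the branches below handle a remaining 2-point and 1-point tail.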
179 2 int bound = num_bytes >> 5;
180 2 int i = 0;
181
182 2 xmm1 = _mm_setzero_ps();
183 2 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
184 2 xmm1 = _mm_movelh_ps(xmm1, xmm1);
185
186
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; i < bound; ++i) {
187 65534 xmm2 = _mm_load_ps((float*)&points[0]);
188 65534 xmm4 = _mm_sub_ps(xmm1, xmm2);
189 131068 xmm3 = _mm_load_ps((float*)&points[2]);
190 65534 xmm5 = _mm_sub_ps(xmm1, xmm3);
191
192 65534 xmm6 = _mm_mul_ps(xmm4, xmm4);
193 65534 xmm7 = _mm_mul_ps(xmm5, xmm5);
194
195 65534 xmm4 = _mm_hadd_ps(xmm6, xmm7);
196
197 _mm_store_ps(target, xmm4);
198
199 65534 points += 4;
200 65534 target += 4;
201 }
202
203
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 4 & 1) {
204
205 2 xmm2 = _mm_load_ps((float*)&points[0]);
206
207 2 xmm4 = _mm_sub_ps(xmm1, xmm2);
208
209 2 points += 2;
210
211 2 xmm6 = _mm_mul_ps(xmm4, xmm4);
212
213 2 xmm4 = _mm_hadd_ps(xmm6, xmm6);
214
215 _mm_storeh_pi((__m64*)target, xmm4);
216
217 2 target += 2;
218 }
219
220
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 3 & 1) {
221
222 2 diff = src0[0] - points[0];
223
224 2 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
225
226 2 target[0] = sq_dist;
227 }
228 2 }
229
230 #endif /*LV_HAVE_SSE3*/
231
232
233 #ifdef LV_HAVE_NEON
234 #include <arm_neon.h>
235 static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
236 lv_32fc_t* src0,
237 lv_32fc_t* points,
238 unsigned int num_points)
239 {
240 const unsigned int quarter_points = num_points / 4;
241 unsigned int number;
242
243 float32x4x2_t a_vec, b_vec;
244 float32x4x2_t diff_vec;
245 float32x4_t tmp, tmp1, dist_sq;
246 a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
247 a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
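// vld2q_f32 de-interleaves four complex points into separate real and
// imaginary vectors, so each batch needs only two subtracts, two multiplies
// and one add.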
248 for (number = 0; number < quarter_points; ++number) {
249 b_vec = vld2q_f32((float*)points);
250 diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
251 diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
252 tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
253 tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
254
255 dist_sq = vaddq_f32(tmp, tmp1);
256 vst1q_f32(target, dist_sq);
257 points += 4;
258 target += 4;
259 }
260 for (number = quarter_points * 4; number < num_points; ++number) {
261 lv_32fc_t diff = src0[0] - *points++;
262 *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
263 }
264 }
265 #endif /* LV_HAVE_NEON */
266
267
268 #ifdef LV_HAVE_GENERIC
269 2 static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
270 lv_32fc_t* src0,
271 lv_32fc_t* points,
272 unsigned int num_points)
273 {
274 2 const unsigned int num_bytes = num_points * 8;
275
276 lv_32fc_t diff;
277 float sq_dist;
278 2 unsigned int i = 0;
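// num_bytes >> 3 recovers num_points (each complex point is 8 bytes).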
279
280
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (; i < (num_bytes >> 3); ++i) {
281 262142 diff = src0[0] - points[i];
282
283 262142 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
284
285 262142 target[i] = sq_dist;
286 }
287 2 }
288
289 #endif /*LV_HAVE_GENERIC*/
290
291
292 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
293
294 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
295 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
296
297 #include <inttypes.h>
298 #include <stdio.h>
299 #include <volk/volk_complex.h>
300
301 #ifdef LV_HAVE_AVX2
302 #include <immintrin.h>
303
304 2 static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
305 lv_32fc_t* src0,
306 lv_32fc_t* points,
307 unsigned int num_points)
308 {
309 2 const unsigned int num_bytes = num_points * 8;
310 __m128 xmm0, xmm9;
311 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
312
313 lv_32fc_t diff;
314 float sq_dist;
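// num_bytes >> 6 counts blocks of 8 complex points; (num_bytes >> 3) & 0b11
// is the 0-3 point tail left after the optional 4-point block below.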
315 2 int bound = num_bytes >> 6;
316 2 int leftovers1 = (num_bytes >> 3) & 0b11;
317 2 int i = 0;
318
319 2 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
320 2 xmm1 = _mm256_setzero_ps();
321 2 xmm0 = _mm_loadu_ps((float*)src0);
322 2 xmm0 = _mm_permute_ps(xmm0, 0b01000100);
323 2 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
324 2 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
325
326
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; i < bound; ++i) {
327 32766 xmm2 = _mm256_loadu_ps((float*)&points[0]);
328 32766 xmm3 = _mm256_loadu_ps((float*)&points[4]);
329 32766 points += 8;
330
331 32766 xmm4 = _mm256_sub_ps(xmm1, xmm2);
332 32766 xmm5 = _mm256_sub_ps(xmm1, xmm3);
333 32766 xmm6 = _mm256_mul_ps(xmm4, xmm4);
334 32766 xmm7 = _mm256_mul_ps(xmm5, xmm5);
335
336 32766 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
337 32766 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
338
339 _mm256_storeu_ps(target, xmm4);
340
341 32766 target += 8;
342 }
343
344
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (num_bytes >> 5 & 1) {
345
346 2 xmm2 = _mm256_loadu_ps((float*)&points[0]);
347
348 2 xmm4 = _mm256_sub_ps(xmm1, xmm2);
349
350 2 points += 4;
351
352 2 xmm6 = _mm256_mul_ps(xmm4, xmm4);
353
354 2 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
355 2 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
356
357 2 xmm9 = _mm256_extractf128_ps(xmm4, 1);
358 _mm_storeu_ps(target, xmm9);
359
360 2 target += 4;
361 }
362
363
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (i = 0; i < leftovers1; ++i) {
364
365 6 diff = src0[0] - points[0];
366 6 points += 1;
367
368 6 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
369
370 6 target[0] = sq_dist;
371 6 target += 1;
372 }
373 2 }
374
375 #endif /*LV_HAVE_AVX2*/
376
377 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
378