Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_square_dist_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Calculates the squared distance between a single complex input and each | ||
16 | * point in a complex vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, | ||
21 | * unsigned int num_points) { \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li src0: The complex input. Only the first point is used. | ||
25 | * \li points: A complex vector of reference points. | ||
26 | * \li num_points: The number of data points. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li target: A vector of distances between src0 and the vector of points. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Calculate the distance between an input and reference points in a square | ||
33 | * 16-qam constellation. | ||
34 | * \code | ||
35 | * int N = 16; | ||
36 | * unsigned int alignment = volk_get_alignment(); | ||
37 | * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
38 | * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
39 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
40 | * float const_vals[] = {-3, -1, 1, 3}; | ||
41 | * | ||
42 | * // Generate 16-QAM constellation points | ||
43 | * unsigned int jj = 0; | ||
44 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
45 | * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]); | ||
46 | * if((ii+1)%4 == 0) ++jj; | ||
47 | * } | ||
48 | * | ||
49 | * *rx = lv_cmake(0.5f, 2.f); | ||
50 | * | ||
51 | * volk_32fc_x2_square_dist_32f(out, rx, constellation, N); | ||
52 | * | ||
53 | * printf("Distance from each constellation point:\n"); | ||
54 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
55 | * printf("%.4f ", out[ii]); | ||
56 | * if((ii+1)%4 == 0) printf("\n"); | ||
57 | * } | ||
58 | * | ||
59 | * volk_free(rx); | ||
60 | * volk_free(constellation); | ||
61 | * volk_free(out); | ||
62 | * \endcode | ||
63 | */ | ||
64 | |||
65 | #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H | ||
66 | #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H | ||
67 | |||
68 | #include <inttypes.h> | ||
69 | #include <stdio.h> | ||
70 | #include <volk/volk_complex.h> | ||
71 | |||
72 | #ifdef LV_HAVE_AVX2 | ||
73 | #include <immintrin.h> | ||
74 | |||
75 | 2 | static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, | |
76 | lv_32fc_t* src0, | ||
77 | lv_32fc_t* points, | ||
78 | unsigned int num_points) | ||
79 | { | ||
80 | 2 | const unsigned int num_bytes = num_points * 8; | |
81 | __m128 xmm0, xmm9, xmm10; | ||
82 | __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; | ||
83 | |||
84 | lv_32fc_t diff; | ||
85 | float sq_dist; | ||
86 | 2 | int bound = num_bytes >> 6; | |
87 | 2 | int leftovers0 = (num_bytes >> 5) & 1; | |
88 | 2 | int leftovers1 = (num_bytes >> 4) & 1; | |
89 | 2 | int leftovers2 = (num_bytes >> 3) & 1; | |
90 | 2 | int i = 0; | |
91 | |||
92 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
93 | 2 | xmm1 = _mm256_setzero_ps(); | |
94 | 2 | xmm0 = _mm_load_ps((float*)src0); | |
95 | 2 | xmm0 = _mm_permute_ps(xmm0, 0b01000100); | |
96 | 2 | xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); | |
97 | 2 | xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); | |
98 | |||
99 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; i < bound; ++i) { |
100 | 32766 | xmm2 = _mm256_load_ps((float*)&points[0]); | |
101 | 32766 | xmm3 = _mm256_load_ps((float*)&points[4]); | |
102 | 32766 | points += 8; | |
103 | |||
104 | 32766 | xmm4 = _mm256_sub_ps(xmm1, xmm2); | |
105 | 32766 | xmm5 = _mm256_sub_ps(xmm1, xmm3); | |
106 | 32766 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
107 | 32766 | xmm7 = _mm256_mul_ps(xmm5, xmm5); | |
108 | |||
109 | 32766 | xmm4 = _mm256_hadd_ps(xmm6, xmm7); | |
110 | 32766 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
111 | |||
112 | _mm256_store_ps(target, xmm4); | ||
113 | |||
114 | 32766 | target += 8; | |
115 | } | ||
116 | |||
117 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (i = 0; i < leftovers0; ++i) { |
118 | |||
119 | 2 | xmm2 = _mm256_load_ps((float*)&points[0]); | |
120 | |||
121 | 2 | xmm4 = _mm256_sub_ps(xmm1, xmm2); | |
122 | |||
123 | 2 | points += 4; | |
124 | |||
125 | 2 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
126 | |||
127 | 2 | xmm4 = _mm256_hadd_ps(xmm6, xmm6); | |
128 | 2 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
129 | |||
130 | 2 | xmm9 = _mm256_extractf128_ps(xmm4, 1); | |
131 | _mm_store_ps(target, xmm9); | ||
132 | |||
133 | 2 | target += 4; | |
134 | } | ||
135 | |||
136 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (i = 0; i < leftovers1; ++i) { |
137 | 2 | xmm9 = _mm_load_ps((float*)&points[0]); | |
138 | |||
139 | 2 | xmm10 = _mm_sub_ps(xmm0, xmm9); | |
140 | |||
141 | 2 | points += 2; | |
142 | |||
143 | 2 | xmm9 = _mm_mul_ps(xmm10, xmm10); | |
144 | |||
145 | 2 | xmm10 = _mm_hadd_ps(xmm9, xmm9); | |
146 | |||
147 | _mm_storeh_pi((__m64*)target, xmm10); | ||
148 | |||
149 | 2 | target += 2; | |
150 | } | ||
151 | |||
152 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (i = 0; i < leftovers2; ++i) { |
153 | |||
154 | 2 | diff = src0[0] - points[0]; | |
155 | |||
156 | 2 | sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); | |
157 | |||
158 | 2 | target[0] = sq_dist; | |
159 | } | ||
160 | 2 | } | |
161 | |||
162 | #endif /*LV_HAVE_AVX2*/ | ||
163 | |||
164 | #ifdef LV_HAVE_SSE3 | ||
165 | #include <pmmintrin.h> | ||
166 | #include <xmmintrin.h> | ||
167 | |||
168 | 2 | static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, | |
169 | lv_32fc_t* src0, | ||
170 | lv_32fc_t* points, | ||
171 | unsigned int num_points) | ||
172 | { | ||
173 | 2 | const unsigned int num_bytes = num_points * 8; | |
174 | |||
175 | __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; | ||
176 | |||
177 | lv_32fc_t diff; | ||
178 | float sq_dist; | ||
179 | 2 | int bound = num_bytes >> 5; | |
180 | 2 | int i = 0; | |
181 | |||
182 | 2 | xmm1 = _mm_setzero_ps(); | |
183 | 2 | xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); | |
184 | 2 | xmm1 = _mm_movelh_ps(xmm1, xmm1); | |
185 | |||
186 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; i < bound; ++i) { |
187 | 65534 | xmm2 = _mm_load_ps((float*)&points[0]); | |
188 | 65534 | xmm4 = _mm_sub_ps(xmm1, xmm2); | |
189 | 131068 | xmm3 = _mm_load_ps((float*)&points[2]); | |
190 | 65534 | xmm5 = _mm_sub_ps(xmm1, xmm3); | |
191 | |||
192 | 65534 | xmm6 = _mm_mul_ps(xmm4, xmm4); | |
193 | 65534 | xmm7 = _mm_mul_ps(xmm5, xmm5); | |
194 | |||
195 | 65534 | xmm4 = _mm_hadd_ps(xmm6, xmm7); | |
196 | |||
197 | _mm_store_ps(target, xmm4); | ||
198 | |||
199 | 65534 | points += 4; | |
200 | 65534 | target += 4; | |
201 | } | ||
202 | |||
203 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 4 & 1) { |
204 | |||
205 | 2 | xmm2 = _mm_load_ps((float*)&points[0]); | |
206 | |||
207 | 2 | xmm4 = _mm_sub_ps(xmm1, xmm2); | |
208 | |||
209 | 2 | points += 2; | |
210 | |||
211 | 2 | xmm6 = _mm_mul_ps(xmm4, xmm4); | |
212 | |||
213 | 2 | xmm4 = _mm_hadd_ps(xmm6, xmm6); | |
214 | |||
215 | _mm_storeh_pi((__m64*)target, xmm4); | ||
216 | |||
217 | 2 | target += 2; | |
218 | } | ||
219 | |||
220 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 3 & 1) { |
221 | |||
222 | 2 | diff = src0[0] - points[0]; | |
223 | |||
224 | 2 | sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); | |
225 | |||
226 | 2 | target[0] = sq_dist; | |
227 | } | ||
228 | 2 | } | |
229 | |||
230 | #endif /*LV_HAVE_SSE3*/ | ||
231 | |||
232 | |||
233 | #ifdef LV_HAVE_NEON | ||
234 | #include <arm_neon.h> | ||
/*
 * NEON kernel: target[k] = |src0[0] - points[k]|^2 for every k in
 * [0, num_points).
 *
 * Four points are handled per vector iteration using a de-interleaving load
 * (real parts in one register, imaginary parts in the other); any remaining
 * points are handled in scalar code.
 */
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int vector_iters = num_points / 4;
    unsigned int k;

    /* Reference value broadcast into separate real / imaginary registers. */
    const float32x4_t ref_re = vdupq_n_f32(lv_creal(src0[0]));
    const float32x4_t ref_im = vdupq_n_f32(lv_cimag(src0[0]));

    for (k = 0; k < vector_iters; ++k) {
        /* val[0] = four real parts, val[1] = four imaginary parts */
        const float32x4x2_t pts = vld2q_f32((float*)points);

        const float32x4_t d_re = vsubq_f32(ref_re, pts.val[0]);
        const float32x4_t d_im = vsubq_f32(ref_im, pts.val[1]);

        const float32x4_t re_sq = vmulq_f32(d_re, d_re);
        const float32x4_t im_sq = vmulq_f32(d_im, d_im);

        vst1q_f32(target, vaddq_f32(re_sq, im_sq));

        points += 4;
        target += 4;
    }

    /* Scalar tail for the points that did not fill a whole vector. */
    for (k = vector_iters * 4; k < num_points; ++k) {
        lv_32fc_t diff = src0[0] - *points;
        ++points;
        *target = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
        ++target;
    }
}
265 | #endif /* LV_HAVE_NEON */ | ||
266 | |||
267 | |||
268 | #ifdef LV_HAVE_GENERIC | ||
269 | 2 | static inline void volk_32fc_x2_square_dist_32f_generic(float* target, | |
270 | lv_32fc_t* src0, | ||
271 | lv_32fc_t* points, | ||
272 | unsigned int num_points) | ||
273 | { | ||
274 | 2 | const unsigned int num_bytes = num_points * 8; | |
275 | |||
276 | lv_32fc_t diff; | ||
277 | float sq_dist; | ||
278 | 2 | unsigned int i = 0; | |
279 | |||
280 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; i<num_bytes>> 3; ++i) { |
281 | 262142 | diff = src0[0] - points[i]; | |
282 | |||
283 | 262142 | sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); | |
284 | |||
285 | 262142 | target[i] = sq_dist; | |
286 | } | ||
287 | 2 | } | |
288 | |||
289 | #endif /*LV_HAVE_GENERIC*/ | ||
290 | |||
291 | |||
292 | #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/ | ||
293 | |||
294 | #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H | ||
295 | #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H | ||
296 | |||
297 | #include <inttypes.h> | ||
298 | #include <stdio.h> | ||
299 | #include <volk/volk_complex.h> | ||
300 | |||
301 | #ifdef LV_HAVE_AVX2 | ||
302 | #include <immintrin.h> | ||
303 | |||
304 | 2 | static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, | |
305 | lv_32fc_t* src0, | ||
306 | lv_32fc_t* points, | ||
307 | unsigned int num_points) | ||
308 | { | ||
309 | 2 | const unsigned int num_bytes = num_points * 8; | |
310 | __m128 xmm0, xmm9; | ||
311 | __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; | ||
312 | |||
313 | lv_32fc_t diff; | ||
314 | float sq_dist; | ||
315 | 2 | int bound = num_bytes >> 6; | |
316 | 2 | int leftovers1 = (num_bytes >> 3) & 0b11; | |
317 | 2 | int i = 0; | |
318 | |||
319 | 2 | __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); | |
320 | 2 | xmm1 = _mm256_setzero_ps(); | |
321 | 2 | xmm0 = _mm_loadu_ps((float*)src0); | |
322 | 2 | xmm0 = _mm_permute_ps(xmm0, 0b01000100); | |
323 | 2 | xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); | |
324 | 2 | xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); | |
325 | |||
326 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; i < bound; ++i) { |
327 | 32766 | xmm2 = _mm256_loadu_ps((float*)&points[0]); | |
328 | 32766 | xmm3 = _mm256_loadu_ps((float*)&points[4]); | |
329 | 32766 | points += 8; | |
330 | |||
331 | 32766 | xmm4 = _mm256_sub_ps(xmm1, xmm2); | |
332 | 32766 | xmm5 = _mm256_sub_ps(xmm1, xmm3); | |
333 | 32766 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
334 | 32766 | xmm7 = _mm256_mul_ps(xmm5, xmm5); | |
335 | |||
336 | 32766 | xmm4 = _mm256_hadd_ps(xmm6, xmm7); | |
337 | 32766 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
338 | |||
339 | _mm256_storeu_ps(target, xmm4); | ||
340 | |||
341 | 32766 | target += 8; | |
342 | } | ||
343 | |||
344 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (num_bytes >> 5 & 1) { |
345 | |||
346 | 2 | xmm2 = _mm256_loadu_ps((float*)&points[0]); | |
347 | |||
348 | 2 | xmm4 = _mm256_sub_ps(xmm1, xmm2); | |
349 | |||
350 | 2 | points += 4; | |
351 | |||
352 | 2 | xmm6 = _mm256_mul_ps(xmm4, xmm4); | |
353 | |||
354 | 2 | xmm4 = _mm256_hadd_ps(xmm6, xmm6); | |
355 | 2 | xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); | |
356 | |||
357 | 2 | xmm9 = _mm256_extractf128_ps(xmm4, 1); | |
358 | _mm_storeu_ps(target, xmm9); | ||
359 | |||
360 | 2 | target += 4; | |
361 | } | ||
362 | |||
363 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (i = 0; i < leftovers1; ++i) { |
364 | |||
365 | 6 | diff = src0[0] - points[0]; | |
366 | 6 | points += 1; | |
367 | |||
368 | 6 | sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); | |
369 | |||
370 | 6 | target[0] = sq_dist; | |
371 | 6 | target += 1; | |
372 | } | ||
373 | 2 | } | |
374 | |||
375 | #endif /*LV_HAVE_AVX2*/ | ||
376 | |||
377 | #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/ | ||
378 |