GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x3_sum_of_poly_32f.h
Date: 2023-10-23 23:10:04
              Exec   Total   Coverage
Lines:         243     243     100.0%
Functions:       6       6     100.0%
Branches:       42      42     100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x3_sum_of_poly_32f
12 *
13 * \b Overview
14 *
15 * Calculates the unscaled area under a fourth order polynomial using the
16 * rectangular method. The result is the sum of y-values. To get the area,
17 * multiply by the rectangle/bin width.
18 *
19 * Expressed as a formula, this function calculates
20 * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot
21 * x^4)\f$
22 *
23 * <b>Dispatcher Prototype</b>
24 * \code
25 * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array,
26 * float* cutoff, unsigned int num_points) \endcode
27 *
28 * \b Inputs
29 * \li src0: x values
30 * \li center_point_array: polynomial coefficients in order {c1, c2, c3, c4, c0}
31 * \li cutoff: the minimum x value to use (will clamp to cutoff if input < cutoff)
32 * \li num_points: The number of values in src0.
33 *
34 * \b Outputs
35 * \li target: The sum of y values generated by the polynomial.
36 *
37 * \b Example
38 * The following estimates \f$\int_0^\pi e^x dx\f$ by using the Taylor expansion
39 * centered around \f$x=1.5\f$,
40 * \f$ e^x \approx e^{1.5} (1 + u + \frac{1}{2} u^2 + \frac{1}{6}u^3 + \frac{1}{24}u^4), \quad u = x - 1.5 \f$
41 * \code
42 * int npoints = 4096;
43 * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
44 * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
45 * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
46 * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
47 *
48 * // load precomputed Taylor series coefficients
49 * coefficients[0] = 4.48168907033806f; // c1
50 * coefficients[1] = coefficients[0] * 0.5f; // c2
51 * coefficients[2] = coefficients[0] * 1.0f/6.0f; // c3
52 * coefficients[3] = coefficients[0] * 1.0f/24.0f; // c4
53 * coefficients[4] = coefficients[0]; // c0
54 * *cutoff = -2.0;
55 * *result = 0.0f;
56 * // generate uniform input data
57 * float dx = (float)M_PI/ (float)npoints;
58 * for(unsigned int ii=0; ii < npoints; ++ii){
59 * input[ii] = dx * (float)ii - 1.5f;
60 * }
61 * volk_32f_x3_sum_of_poly_32f(result, input, coefficients, cutoff, npoints);
62 * // multiply by bin width to get approximate area
63 * std::cout << "result is " << *result * (input[1]-input[0]) << std::endl;
64 * volk_free(coefficients);
65 * volk_free(input);
66 * volk_free(result);
67 * volk_free(cutoff);
68 * \endcode
69 */
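
(For reference, a minimal scalar sketch of what each kernel below computes; the
function name is illustrative and not part of the header. Note the coefficient
layout: center_point_array[0..3] hold c1..c4 and center_point_array[4] holds c0,
which is accumulated once per point as num_points * c0.)

static inline void sum_of_poly_reference_sketch(float* target,
                                                const float* src0,
                                                const float* center_point_array,
                                                const float* cutoff,
                                                unsigned int num_points)
{
    float result = 0.0f;
    for (unsigned int i = 0; i < num_points; ++i) {
        /* clamp the input to the cutoff before evaluating the polynomial */
        float x = (src0[i] > *cutoff) ? src0[i] : *cutoff;
        float x2 = x * x;
        result += center_point_array[0] * x + center_point_array[1] * x2 +
                  center_point_array[2] * x2 * x + center_point_array[3] * x2 * x2;
    }
    /* the constant term c0 contributes num_points * c0 in total */
    *target = result + (float)num_points * center_point_array[4];
}
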
70
71 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
72 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
73
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <volk/volk_complex.h>
77
78 #ifndef MAX
79 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
80 #endif
81
82 #ifdef LV_HAVE_SSE3
83 #include <pmmintrin.h>
84 #include <xmmintrin.h>
85
86 2 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
87 float* src0,
88 float* center_point_array,
89 float* cutoff,
90 unsigned int num_points)
91 {
92 2 float result = 0.0f;
93 2 float fst = 0.0f;
94 2 float sq = 0.0f;
95 2 float thrd = 0.0f;
96 2 float frth = 0.0f;
97
98 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
99
100 2 xmm9 = _mm_setzero_ps();
101 2 xmm1 = _mm_setzero_ps();
102 2 xmm0 = _mm_load1_ps(&center_point_array[0]);
103 2 xmm6 = _mm_load1_ps(&center_point_array[1]);
104 2 xmm7 = _mm_load1_ps(&center_point_array[2]);
105 4 xmm8 = _mm_load1_ps(&center_point_array[3]);
106 2 xmm10 = _mm_load1_ps(cutoff);
107
108 2 int bound = num_points / 8;
109 2 int leftovers = num_points - 8 * bound;
110 2 int i = 0;
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
111 32768 for (; i < bound; ++i) {
112 // 1st
113 32766 xmm2 = _mm_load_ps(src0);
114 32766 xmm2 = _mm_max_ps(xmm10, xmm2);
115 32766 xmm3 = _mm_mul_ps(xmm2, xmm2);
116 32766 xmm4 = _mm_mul_ps(xmm2, xmm3);
117 32766 xmm5 = _mm_mul_ps(xmm3, xmm3);
118
119 32766 xmm2 = _mm_mul_ps(xmm2, xmm0);
120 32766 xmm3 = _mm_mul_ps(xmm3, xmm6);
121 32766 xmm4 = _mm_mul_ps(xmm4, xmm7);
122 32766 xmm5 = _mm_mul_ps(xmm5, xmm8);
123
124 32766 xmm2 = _mm_add_ps(xmm2, xmm3);
125 32766 xmm3 = _mm_add_ps(xmm4, xmm5);
126
127 32766 src0 += 4;
128
129 32766 xmm9 = _mm_add_ps(xmm2, xmm9);
130 32766 xmm9 = _mm_add_ps(xmm3, xmm9);
131
132 // 2nd
133 32766 xmm2 = _mm_load_ps(src0);
134 32766 xmm2 = _mm_max_ps(xmm10, xmm2);
135 32766 xmm3 = _mm_mul_ps(xmm2, xmm2);
136 32766 xmm4 = _mm_mul_ps(xmm2, xmm3);
137 32766 xmm5 = _mm_mul_ps(xmm3, xmm3);
138
139 32766 xmm2 = _mm_mul_ps(xmm2, xmm0);
140 32766 xmm3 = _mm_mul_ps(xmm3, xmm6);
141 32766 xmm4 = _mm_mul_ps(xmm4, xmm7);
142 32766 xmm5 = _mm_mul_ps(xmm5, xmm8);
143
144 32766 xmm2 = _mm_add_ps(xmm2, xmm3);
145 32766 xmm3 = _mm_add_ps(xmm4, xmm5);
146
147 32766 src0 += 4;
148
149 32766 xmm1 = _mm_add_ps(xmm2, xmm1);
150 32766 xmm1 = _mm_add_ps(xmm3, xmm1);
151 }
152 2 xmm2 = _mm_hadd_ps(xmm9, xmm1);
153 2 xmm3 = _mm_hadd_ps(xmm2, xmm2);
154 2 xmm4 = _mm_hadd_ps(xmm3, xmm3);
155 2 _mm_store_ss(&result, xmm4);
156
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
157 16 for (i = 0; i < leftovers; ++i) {
158 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
159 14 fst = MAX(fst, *cutoff);
160 14 sq = fst * fst;
161 14 thrd = fst * sq;
162 14 frth = sq * sq;
163 14 result += (center_point_array[0] * fst + center_point_array[1] * sq +
164 14 center_point_array[2] * thrd + center_point_array[3] * frth);
165 }
166
167 2 result += (float)(num_points)*center_point_array[4];
168 2 *target = result;
169 2 }
170
171
172 #endif /*LV_HAVE_SSE3*/
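
(The three _mm_hadd_ps calls above fold the two four-lane accumulators xmm9 and
xmm1 into a single scalar. A scalar sketch of that reduction, with illustrative
names, assuming acc_a and acc_b hold the two partial-sum vectors:)

static inline float hadd_reduce_sketch(const float acc_a[4], const float acc_b[4])
{
    /* first hadd: pairwise sums of both accumulators */
    float s0 = acc_a[0] + acc_a[1];
    float s1 = acc_a[2] + acc_a[3];
    float s2 = acc_b[0] + acc_b[1];
    float s3 = acc_b[2] + acc_b[3];
    /* second and third hadd: fold the four partial sums into one */
    return (s0 + s1) + (s2 + s3);
}
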
173
174 #if LV_HAVE_AVX && LV_HAVE_FMA
175 #include <immintrin.h>
176
177 2 static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
178 float* src0,
179 float* center_point_array,
180 float* cutoff,
181 unsigned int num_points)
182 {
183 2 const unsigned int eighth_points = num_points / 8;
184 2 float fst = 0.0;
185 2 float sq = 0.0;
186 2 float thrd = 0.0;
187 2 float frth = 0.0;
188
189 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
190 __m256 target_vec;
191 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
192
193 2 cpa0 = _mm256_set1_ps(center_point_array[0]);
194 2 cpa1 = _mm256_set1_ps(center_point_array[1]);
195 2 cpa2 = _mm256_set1_ps(center_point_array[2]);
196 2 cpa3 = _mm256_set1_ps(center_point_array[3]);
197 4 cutoff_vec = _mm256_set1_ps(*cutoff);
198 2 target_vec = _mm256_setzero_ps();
199
200 unsigned int i;
201
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
202 32768 for (i = 0; i < eighth_points; ++i) {
203 32766 x_to_1 = _mm256_load_ps(src0);
204 32766 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
205 32766 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
206 32766 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
207 // x^1 * x^3 is slightly faster than x^2 * x^2
208 32766 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
209
210 32766 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
211 32766 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
212
213 32766 x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
214 32766 x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
215 // this is slightly faster than result += (x_to_1 + x_to_3)
216 32766 target_vec = _mm256_add_ps(x_to_1, target_vec);
217 32766 target_vec = _mm256_add_ps(x_to_3, target_vec);
218
219 32766 src0 += 8;
220 }
221
222 // the hadd for vector reduction has very very slight impact @ 50k iters
223 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
224 2 target_vec = _mm256_hadd_ps(
225 target_vec,
226 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
227 _mm256_store_ps(temp_results, target_vec);
228 2 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
229
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
230 16 for (i = eighth_points * 8; i < num_points; ++i) {
231 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
232 14 fst = MAX(fst, *cutoff);
233 14 sq = fst * fst;
234 14 thrd = fst * sq;
235 14 frth = sq * sq;
236 14 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
237 14 center_point_array[2] * thrd + center_point_array[3] * frth);
238 }
239 2 *target += (float)(num_points)*center_point_array[4];
240 2 }
241 #endif // LV_HAVE_AVX && LV_HAVE_FMA
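
(Per lane, the FMA variant above evaluates the four polynomial terms with two
fused multiply-adds instead of the four multiplies and two adds of the plain
AVX path. A scalar sketch using fmaf() from <math.h>; the function and
parameter names are illustrative:)

#include <math.h>

static inline float poly_terms_fma_sketch(float x, const float cpa[4])
{
    float x2 = x * x;
    float x3 = x * x2;
    float x4 = x * x3;
    return fmaf(x, cpa[0], cpa[1] * x2)   /* cpa[0]*x   + cpa[1]*x^2 */
         + fmaf(x3, cpa[2], cpa[3] * x4); /* cpa[2]*x^3 + cpa[3]*x^4 */
}
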
242
243 #ifdef LV_HAVE_AVX
244 #include <immintrin.h>
245
246 2 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
247 float* src0,
248 float* center_point_array,
249 float* cutoff,
250 unsigned int num_points)
251 {
252 2 const unsigned int eighth_points = num_points / 8;
253 2 float fst = 0.0;
254 2 float sq = 0.0;
255 2 float thrd = 0.0;
256 2 float frth = 0.0;
257
258 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
259 __m256 target_vec;
260 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
261
262 2 cpa0 = _mm256_set1_ps(center_point_array[0]);
263 2 cpa1 = _mm256_set1_ps(center_point_array[1]);
264 2 cpa2 = _mm256_set1_ps(center_point_array[2]);
265 2 cpa3 = _mm256_set1_ps(center_point_array[3]);
266 4 cutoff_vec = _mm256_set1_ps(*cutoff);
267 2 target_vec = _mm256_setzero_ps();
268
269 unsigned int i;
270
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
271 32768 for (i = 0; i < eighth_points; ++i) {
272 32766 x_to_1 = _mm256_load_ps(src0);
273 32766 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
274 32766 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
275 32766 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
276 // x^1 * x^3 is slightly faster than x^2 * x^2
277 32766 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
278
279 32766 x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
280 32766 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
281 32766 x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
282 32766 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
283
284 32766 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
285 32766 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
286 // this is slightly faster than result += (x_to_1 + x_to_3)
287 32766 target_vec = _mm256_add_ps(x_to_1, target_vec);
288 32766 target_vec = _mm256_add_ps(x_to_3, target_vec);
289
290 32766 src0 += 8;
291 }
292
293 // the hadd for vector reduction has very very slight impact @ 50k iters
294 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
295 2 target_vec = _mm256_hadd_ps(
296 target_vec,
297 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
298 _mm256_store_ps(temp_results, target_vec);
299 2 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
300
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
301 16 for (i = eighth_points * 8; i < num_points; ++i) {
302 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
303 14 fst = MAX(fst, *cutoff);
304 14 sq = fst * fst;
305 14 thrd = fst * sq;
306 14 frth = sq * sq;
307 14 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
308 14 center_point_array[2] * thrd + center_point_array[3] * frth);
309 }
310 2 *target += (float)(num_points)*center_point_array[4];
311 2 }
312 #endif // LV_HAVE_AVX
313
314
315 #ifdef LV_HAVE_GENERIC
316
317 2 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
318 float* src0,
319 float* center_point_array,
320 float* cutoff,
321 unsigned int num_points)
322 {
323 2 const unsigned int eighth_points = num_points / 8;
324
325 2 float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
326 2 float fst = 0.0f;
327 2 float sq = 0.0f;
328 2 float thrd = 0.0f;
329 2 float frth = 0.0f;
330
331 2 unsigned int i = 0;
332 2 unsigned int k = 0;
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
333 32768 for (i = 0; i < eighth_points; ++i) {
2/2
✓ Branch 0 taken 262128 times.
✓ Branch 1 taken 32766 times.
334 294894 for (k = 0; k < 8; ++k) {
335 262128 fst = *src0++;
2/2
✓ Branch 0 taken 93053 times.
✓ Branch 1 taken 169075 times.
336 262128 fst = MAX(fst, *cutoff);
337 262128 sq = fst * fst;
338 262128 thrd = fst * sq;
339 262128 frth = fst * thrd;
340 262128 result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
341 262128 result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
342 }
343 }
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
344 10 for (k = 0; k < 8; k += 2)
345 8 result[k] = result[k] + result[k + 1];
346
347 2 *target = result[0] + result[2] + result[4] + result[6];
348
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
349 16 for (i = eighth_points * 8; i < num_points; ++i) {
350 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
351 14 fst = MAX(fst, *cutoff);
352 14 sq = fst * fst;
353 14 thrd = fst * sq;
354 14 frth = fst * thrd;
355 14 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
356 14 center_point_array[2] * thrd + center_point_array[3] * frth);
357 }
358 2 *target += (float)(num_points)*center_point_array[4];
359 2 }
360
361 #endif /*LV_HAVE_GENERIC*/
362
363 #ifdef LV_HAVE_NEON
364 #include <arm_neon.h>
365
366 static inline void
367 volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
368 float* __restrict src0,
369 float* __restrict center_point_array,
370 float* __restrict cutoff,
371 unsigned int num_points)
372 {
373 unsigned int i;
374 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
375
376 float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
377 float32x2_t cutoff_vector;
378 float32x2x2_t x_low, x_high;
379 float32x4_t x_qvector, c_qvector, cpa_qvector;
380 float accumulator;
381 float res_accumulators[4];
382
383 c_qvector = vld1q_f32(zero);
384 // load the cutoff in to a vector
385 cutoff_vector = vdup_n_f32(*cutoff);
386 // ... center point array
387 cpa_qvector = vld1q_f32(center_point_array);
388
389 for (i = 0; i < num_points; ++i) {
390 // load x (src0)
391 x_to_1 = vdup_n_f32(*src0++);
392
393 // Get a vector of max(src0, cutoff)
394 x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
395 x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
396 x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
397 x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
398 // zip up doubles to interleave
399 x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
400 x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
401 // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
402 x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
403 // now we finally have [x^4 | x^3 | x^2 | x] !
404
405 c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
406 }
407 // there should be better vector reduction techniques
408 vst1q_f32(res_accumulators, c_qvector);
409 accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
410 res_accumulators[3];
411
412 *target = accumulator + (float)num_points * center_point_array[4];
413 }
414
415 #endif /* LV_HAVE_NEON */
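
(The vzip/vcombine sequence above packs the four powers of each input value
into one quad vector so that a single vmlaq_f32 accumulates cpa[k] * x^(k+1)
per lane. A scalar sketch of that per-point step, with illustrative names:)

static inline void neon_lane_sketch(float accumulator[4], float x, const float cpa[4])
{
    float powers[4] = { x, x * x, x * x * x, x * x * x * x }; /* [x | x^2 | x^3 | x^4] */
    for (int lane = 0; lane < 4; ++lane)
        accumulator[lane] += powers[lane] * cpa[lane]; /* one vmlaq_f32 step */
}
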
416
417
418 #ifdef LV_HAVE_NEON
419
420 static inline void
421 volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
422 float* __restrict src0,
423 float* __restrict center_point_array,
424 float* __restrict cutoff,
425 unsigned int num_points)
426 {
427 unsigned int i;
428 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
429
430 float accumulator;
431
432 float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
433 accumulator1_vec = vld1q_f32(zero);
434 accumulator2_vec = vld1q_f32(zero);
435 accumulator3_vec = vld1q_f32(zero);
436 accumulator4_vec = vld1q_f32(zero);
437 float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
438 float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
439
440 // load the cutoff in to a vector
441 cutoff_vector = vdupq_n_f32(*cutoff);
442 // ... center point array
443 cpa_0 = vdupq_n_f32(center_point_array[0]);
444 cpa_1 = vdupq_n_f32(center_point_array[1]);
445 cpa_2 = vdupq_n_f32(center_point_array[2]);
446 cpa_3 = vdupq_n_f32(center_point_array[3]);
447
448 // it is not clear why this variant is slower *and* less accurate than neonvertfma
449 for (i = 0; i < num_points / 4; ++i) {
450 // load x
451 x_to_1 = vld1q_f32(src0);
452
453 // Get a vector of max(src0, cutoff)
454 x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
455 x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
456 x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
457 x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
458 x_to_1 = vmulq_f32(x_to_1, cpa_0);
459 x_to_2 = vmulq_f32(x_to_2, cpa_1);
460 x_to_3 = vmulq_f32(x_to_3, cpa_2);
461 x_to_4 = vmulq_f32(x_to_4, cpa_3);
462 accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
463 accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
464 accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
465 accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
466
467 src0 += 4;
468 }
469 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
470 accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
471 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
472
473 __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
474 vst1q_f32(res_accumulators, accumulator1_vec);
475 accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
476 res_accumulators[3];
477
478 float fst = 0.0;
479 float sq = 0.0;
480 float thrd = 0.0;
481 float frth = 0.0;
482
483 for (i = 4 * (num_points / 4); i < num_points; ++i) {
484 fst = *src0++;
485 fst = MAX(fst, *cutoff);
486
487 sq = fst * fst;
488 thrd = fst * sq;
489 frth = sq * sq;
490 // fifth = sq * thrd;
491
492 accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
493 center_point_array[2] * thrd + center_point_array[3] * frth); //+
494 }
495
496 *target = accumulator + (float)num_points * center_point_array[4];
497 }
498
499 #endif /* LV_HAVE_NEON */
500
501 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
502
503 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
504 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
505
506 #include <inttypes.h>
507 #include <stdio.h>
508 #include <volk/volk_complex.h>
509
510 #ifndef MAX
511 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
512 #endif
513
514 #if LV_HAVE_AVX && LV_HAVE_FMA
515 #include <immintrin.h>
516
517 2 static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
518 float* src0,
519 float* center_point_array,
520 float* cutoff,
521 unsigned int num_points)
522 {
523 2 const unsigned int eighth_points = num_points / 8;
524 2 float fst = 0.0;
525 2 float sq = 0.0;
526 2 float thrd = 0.0;
527 2 float frth = 0.0;
528
529 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
530 __m256 target_vec;
531 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
532
533 2 cpa0 = _mm256_set1_ps(center_point_array[0]);
534 2 cpa1 = _mm256_set1_ps(center_point_array[1]);
535 2 cpa2 = _mm256_set1_ps(center_point_array[2]);
536 2 cpa3 = _mm256_set1_ps(center_point_array[3]);
537 4 cutoff_vec = _mm256_set1_ps(*cutoff);
538 2 target_vec = _mm256_setzero_ps();
539
540 unsigned int i;
541
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
542 32768 for (i = 0; i < eighth_points; ++i) {
543 32766 x_to_1 = _mm256_loadu_ps(src0);
544 32766 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
545 32766 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
546 32766 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
547 // x^1 * x^3 is slightly faster than x^2 * x^2
548 32766 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
549
550 32766 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
551 32766 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
552
553 32766 x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
554 32766 x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
555 // this is slightly faster than result += (x_to_1 + x_to_3)
556 32766 target_vec = _mm256_add_ps(x_to_1, target_vec);
557 32766 target_vec = _mm256_add_ps(x_to_3, target_vec);
558
559 32766 src0 += 8;
560 }
561
562 // the hadd for vector reduction has very very slight impact @ 50k iters
563 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
564 2 target_vec = _mm256_hadd_ps(
565 target_vec,
566 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
567 _mm256_storeu_ps(temp_results, target_vec);
568 2 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
569
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
570 16 for (i = eighth_points * 8; i < num_points; ++i) {
571 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
572 14 fst = MAX(fst, *cutoff);
573 14 sq = fst * fst;
574 14 thrd = fst * sq;
575 14 frth = sq * sq;
576 14 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
577 14 center_point_array[2] * thrd + center_point_array[3] * frth);
578 }
579
580 2 *target += (float)(num_points)*center_point_array[4];
581 2 }
582 #endif // LV_HAVE_AVX && LV_HAVE_FMA
583
584 #ifdef LV_HAVE_AVX
585 #include <immintrin.h>
586
587 2 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
588 float* src0,
589 float* center_point_array,
590 float* cutoff,
591 unsigned int num_points)
592 {
593 2 const unsigned int eighth_points = num_points / 8;
594 2 float fst = 0.0;
595 2 float sq = 0.0;
596 2 float thrd = 0.0;
597 2 float frth = 0.0;
598
599 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
600 __m256 target_vec;
601 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
602
603 2 cpa0 = _mm256_set1_ps(center_point_array[0]);
604 2 cpa1 = _mm256_set1_ps(center_point_array[1]);
605 2 cpa2 = _mm256_set1_ps(center_point_array[2]);
606 2 cpa3 = _mm256_set1_ps(center_point_array[3]);
607 4 cutoff_vec = _mm256_set1_ps(*cutoff);
608 2 target_vec = _mm256_setzero_ps();
609
610 unsigned int i;
611
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
612 32768 for (i = 0; i < eighth_points; ++i) {
613 32766 x_to_1 = _mm256_loadu_ps(src0);
614 32766 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
615 32766 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
616 32766 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
617 // x^1 * x^3 is slightly faster than x^2 * x^2
618 32766 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
619
620 32766 x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
621 32766 x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
622 32766 x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
623 32766 x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
624
625 32766 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
626 32766 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
627 // this is slightly faster than result += (x_to_1 + x_to_3)
628 32766 target_vec = _mm256_add_ps(x_to_1, target_vec);
629 32766 target_vec = _mm256_add_ps(x_to_3, target_vec);
630
631 32766 src0 += 8;
632 }
633
634 // the hadd for vector reduction has very very slight impact @ 50k iters
635 __VOLK_ATTR_ALIGNED(32) float temp_results[8];
636 2 target_vec = _mm256_hadd_ps(
637 target_vec,
638 target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
639 _mm256_storeu_ps(temp_results, target_vec);
640 2 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
641
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
642 16 for (i = eighth_points * 8; i < num_points; ++i) {
643 14 fst = *src0++;
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
644 14 fst = MAX(fst, *cutoff);
645 14 sq = fst * fst;
646 14 thrd = fst * sq;
647 14 frth = sq * sq;
648
649 14 *target += (center_point_array[0] * fst + center_point_array[1] * sq +
650 14 center_point_array[2] * thrd + center_point_array[3] * frth);
651 }
652
653 2 *target += (float)(num_points)*center_point_array[4];
654 2 }
655 #endif // LV_HAVE_AVX
656
657 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
658