Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_x3_sum_of_poly_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Calculates the unscaled area under a fourth order polynomial using the | ||
16 | * rectangular method. The result is the sum of y-values. To get the area, | ||
17 | * multiply by the rectangle/bin width. | ||
18 | * | ||
19 | * Expressed as a formula, this function calculates | ||
20 | * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot | ||
21 | * x^4)\f$ | ||
22 | * | ||
23 | * <b>Dispatcher Prototype</b> | ||
24 | * \code | ||
25 | * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, | ||
26 | * float* cutoff, unsigned int num_points) \endcode | ||
27 | * | ||
28 | * \b Inputs | ||
29 | * \li src0: x values | ||
30 | * \li center_point_array: polynomial coefficients in order {c1, c2, c3, c4, c0} | ||
31 | * \li cutoff: the minimum x value to use (will clamp to cutoff if input < cutoff) | ||
32 | * \li num_points: The number of values in both input vectors. | ||
33 | * | ||
34 | * \b Outputs | ||
35 | * \li target: The sum of y values generated by the polynomial. | ||
36 | * | ||
37 | * \b Example | ||
38 | * The following estimates \f$\int_0^\pi e^x dx\f$ by using the Taylor expansion | ||
39 | * centered around \f$x=1.5\f$, | ||
40 | * \f$ e^x = e^{1.5} \cdot (1 + x + \frac{1}{2} x^2 + \frac{1}{6}x^3 + \frac{1}{24}x^4) \f$ | ||
41 | * \code | ||
42 | * int npoints = 4096; | ||
43 | * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment()); | ||
44 | * float* input = (float*)volk_malloc(sizeof(float) * npoints, | ||
45 | * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float), | ||
46 | * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float), | ||
47 | * volk_get_alignment()); | ||
48 | * // load precomputed Taylor series coefficients | ||
49 | * coefficients[0] = 4.48168907033806f; // c1 | ||
50 | * coefficients[1] = coefficients[0] * 0.5f; // c2 | ||
51 | * coefficients[2] = coefficients[0] * 1.0f/6.0f; // c3 | ||
52 | * coefficients[3] = coefficients[0] * 1.0f/24.0f; // c4 | ||
53 | * coefficients[4] = coefficients[0]; // c0 | ||
54 | * *cutoff = -2.0; | ||
55 | * *result = 0.0f; | ||
56 | * // generate uniform input data | ||
57 | * float dx = (float)M_PI/ (float)npoints; | ||
58 | * for(unsigned int ii=0; ii < npoints; ++ii){ | ||
59 | * input[ii] = dx * (float)ii - 1.5f; | ||
60 | * } | ||
61 | * volk_32f_x3_sum_of_poly_32f(result, input, coefficients, cutoff, npoints); | ||
62 | * // multiply by bin width to get approximate area | ||
63 | * std::cout << "result is " << *result * (input[1]-input[0]) << std::endl; | ||
64 | * volk_free(coefficients); | ||
65 | * volk_free(input); | ||
66 | * volk_free(result); | ||
67 | * volk_free(cutoff); | ||
68 | * \endcode | ||
69 | */ | ||
70 | |||
71 | #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H | ||
72 | #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H | ||
73 | |||
74 | #include <inttypes.h> | ||
75 | #include <stdio.h> | ||
76 | #include <volk/volk_complex.h> | ||
77 | |||
78 | #ifndef MAX | ||
79 | #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) | ||
80 | #endif | ||
81 | |||
82 | #ifdef LV_HAVE_SSE3 | ||
83 | #include <pmmintrin.h> | ||
84 | #include <xmmintrin.h> | ||
85 | |||
86 | 2 | static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, | |
87 | float* src0, | ||
88 | float* center_point_array, | ||
89 | float* cutoff, | ||
90 | unsigned int num_points) | ||
91 | { | ||
92 | 2 | float result = 0.0f; | |
93 | 2 | float fst = 0.0f; | |
94 | 2 | float sq = 0.0f; | |
95 | 2 | float thrd = 0.0f; | |
96 | 2 | float frth = 0.0f; | |
97 | |||
98 | __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; | ||
99 | |||
100 | 2 | xmm9 = _mm_setzero_ps(); | |
101 | 2 | xmm1 = _mm_setzero_ps(); | |
102 | 2 | xmm0 = _mm_load1_ps(¢er_point_array[0]); | |
103 | 2 | xmm6 = _mm_load1_ps(¢er_point_array[1]); | |
104 | 2 | xmm7 = _mm_load1_ps(¢er_point_array[2]); | |
105 | 4 | xmm8 = _mm_load1_ps(¢er_point_array[3]); | |
106 | 2 | xmm10 = _mm_load1_ps(cutoff); | |
107 | |||
108 | 2 | int bound = num_points / 8; | |
109 | 2 | int leftovers = num_points - 8 * bound; | |
110 | 2 | int i = 0; | |
111 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; i < bound; ++i) { |
112 | // 1st | ||
113 | 32766 | xmm2 = _mm_load_ps(src0); | |
114 | 32766 | xmm2 = _mm_max_ps(xmm10, xmm2); | |
115 | 32766 | xmm3 = _mm_mul_ps(xmm2, xmm2); | |
116 | 32766 | xmm4 = _mm_mul_ps(xmm2, xmm3); | |
117 | 32766 | xmm5 = _mm_mul_ps(xmm3, xmm3); | |
118 | |||
119 | 32766 | xmm2 = _mm_mul_ps(xmm2, xmm0); | |
120 | 32766 | xmm3 = _mm_mul_ps(xmm3, xmm6); | |
121 | 32766 | xmm4 = _mm_mul_ps(xmm4, xmm7); | |
122 | 32766 | xmm5 = _mm_mul_ps(xmm5, xmm8); | |
123 | |||
124 | 32766 | xmm2 = _mm_add_ps(xmm2, xmm3); | |
125 | 32766 | xmm3 = _mm_add_ps(xmm4, xmm5); | |
126 | |||
127 | 32766 | src0 += 4; | |
128 | |||
129 | 32766 | xmm9 = _mm_add_ps(xmm2, xmm9); | |
130 | 32766 | xmm9 = _mm_add_ps(xmm3, xmm9); | |
131 | |||
132 | // 2nd | ||
133 | 32766 | xmm2 = _mm_load_ps(src0); | |
134 | 32766 | xmm2 = _mm_max_ps(xmm10, xmm2); | |
135 | 32766 | xmm3 = _mm_mul_ps(xmm2, xmm2); | |
136 | 32766 | xmm4 = _mm_mul_ps(xmm2, xmm3); | |
137 | 32766 | xmm5 = _mm_mul_ps(xmm3, xmm3); | |
138 | |||
139 | 32766 | xmm2 = _mm_mul_ps(xmm2, xmm0); | |
140 | 32766 | xmm3 = _mm_mul_ps(xmm3, xmm6); | |
141 | 32766 | xmm4 = _mm_mul_ps(xmm4, xmm7); | |
142 | 32766 | xmm5 = _mm_mul_ps(xmm5, xmm8); | |
143 | |||
144 | 32766 | xmm2 = _mm_add_ps(xmm2, xmm3); | |
145 | 32766 | xmm3 = _mm_add_ps(xmm4, xmm5); | |
146 | |||
147 | 32766 | src0 += 4; | |
148 | |||
149 | 32766 | xmm1 = _mm_add_ps(xmm2, xmm1); | |
150 | 32766 | xmm1 = _mm_add_ps(xmm3, xmm1); | |
151 | } | ||
152 | 2 | xmm2 = _mm_hadd_ps(xmm9, xmm1); | |
153 | 2 | xmm3 = _mm_hadd_ps(xmm2, xmm2); | |
154 | 2 | xmm4 = _mm_hadd_ps(xmm3, xmm3); | |
155 | 2 | _mm_store_ss(&result, xmm4); | |
156 | |||
157 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (i = 0; i < leftovers; ++i) { |
158 | 14 | fst = *src0++; | |
159 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 9 times.
|
14 | fst = MAX(fst, *cutoff); |
160 | 14 | sq = fst * fst; | |
161 | 14 | thrd = fst * sq; | |
162 | 14 | frth = sq * sq; | |
163 | 14 | result += (center_point_array[0] * fst + center_point_array[1] * sq + | |
164 | 14 | center_point_array[2] * thrd + center_point_array[3] * frth); | |
165 | } | ||
166 | |||
167 | 2 | result += (float)(num_points)*center_point_array[4]; | |
168 | 2 | *target = result; | |
169 | 2 | } | |
170 | |||
171 | |||
172 | #endif /*LV_HAVE_SSE3*/ | ||
173 | |||
174 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
175 | #include <immintrin.h> | ||
176 | |||
/*!
 * AVX2+FMA implementation (src0 must be 32-byte aligned).
 *
 * Accumulates cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4 for every
 * sample x = max(src0[i], *cutoff); the constant term num_points*cpa[4]
 * is added once at the end.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values (32-byte aligned)
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    // Broadcast coefficients and cutoff once before the loop.
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); // clamp to cutoff
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        // Fuse the x^1 and x^3 terms onto the x^2/x^4 products.
        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0]*x   + cpa[1]*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2]*x^3 + cpa[3]*x^4
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    // Scalar tail for the final num_points % 8 samples.
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    // Constant term c0 contributes once per point.
    *target += (float)(num_points)*center_point_array[4];
}
241 | #endif // LV_HAVE_AVX && LV_HAVE_FMA | ||
242 | |||
243 | #ifdef LV_HAVE_AVX | ||
244 | #include <immintrin.h> | ||
245 | |||
/*!
 * AVX implementation (src0 must be 32-byte aligned; no FMA used).
 *
 * Accumulates cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4 for every
 * sample x = max(src0[i], *cutoff); the constant term num_points*cpa[4]
 * is added once at the end.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values (32-byte aligned)
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    // Broadcast coefficients and cutoff once before the loop.
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); // clamp to cutoff
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    // Scalar tail for the final num_points % 8 samples.
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    // Constant term c0 contributes once per point.
    *target += (float)(num_points)*center_point_array[4];
}
312 | #endif // LV_HAVE_AVX | ||
313 | |||
314 | |||
315 | #ifdef LV_HAVE_GENERIC | ||
316 | |||
/*!
 * Generic (portable C) implementation.
 *
 * For each sample x = max(src0[i], *cutoff) this accumulates
 *   cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4
 * across all num_points samples, then adds the constant term once as
 * num_points * cpa[4].  Eight independent accumulator lanes are kept so
 * the summation order mirrors the SIMD kernels.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    const unsigned int full_groups = num_points / 8;

    float lanes[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

    unsigned int group;
    unsigned int lane;

    // Main loop: eight samples per group, one accumulator lane each.
    for (group = 0; group < full_groups; ++group) {
        for (lane = 0; lane < 8; ++lane) {
            float x = *src0++;
            if (x < *cutoff) {
                x = *cutoff; // clamp to cutoff
            }
            const float x2 = x * x;
            const float x3 = x * x2;
            const float x4 = x * x3;
            lanes[lane] += center_point_array[0] * x + center_point_array[1] * x2;
            lanes[lane] += center_point_array[2] * x3 + center_point_array[3] * x4;
        }
    }

    // Pairwise reduction of the eight lanes into a single scalar.
    for (lane = 0; lane < 8; lane += 2)
        lanes[lane] = lanes[lane] + lanes[lane + 1];

    *target = lanes[0] + lanes[2] + lanes[4] + lanes[6];

    // Scalar tail for the final num_points % 8 samples.
    for (group = full_groups * 8; group < num_points; ++group) {
        float x = *src0++;
        if (x < *cutoff) {
            x = *cutoff;
        }
        const float x2 = x * x;
        const float x3 = x * x2;
        const float x4 = x * x3;
        *target += (center_point_array[0] * x + center_point_array[1] * x2 +
                    center_point_array[2] * x3 + center_point_array[3] * x4);
    }
    // Constant term c0 contributes once per point.
    *target += (float)(num_points)*center_point_array[4];
}
360 | |||
361 | #endif /*LV_HAVE_GENERIC*/ | ||
362 | |||
363 | #ifdef LV_HAVE_NEON | ||
364 | #include <arm_neon.h> | ||
365 | |||
/*!
 * NEON implementation: one sample per iteration, but all four polynomial
 * terms of that sample are evaluated in a single quad vector so the
 * coefficient multiply-accumulate is one vmlaq_f32.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    // Per-lane accumulator: lane k sums cpa[k] * x^(k+1) over all samples.
    c_qvector = vld1q_f32(zero);
    // load the cutoff in to a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... center point array
    cpa_qvector = vld1q_f32(center_point_array);

    for (i = 0; i < num_points; ++i) {
        // load x (src0)
        x_to_1 = vdup_n_f32(*src0++);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
        // zip up doubles to interleave
        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
        // now we finally have [x^4 | x^3 | x^2 | x] !

        // One fused multiply-accumulate covers all four terms of this sample.
        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // there should be better vector reduction techniques
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    // Constant term c0 contributes once per point.
    *target = accumulator + (float)num_points * center_point_array[4];
}
414 | |||
415 | #endif /* LV_HAVE_NEON */ | ||
416 | |||
417 | |||
418 | #ifdef LV_HAVE_NEON | ||
419 | |||
/*!
 * NEON "vertical" implementation: four samples per iteration with one
 * accumulator vector per polynomial term.
 *
 * NOTE(review): the original author's comment below reports this variant
 * was observed to be slower *and* produce different results than the fma
 * variant — verify its output before relying on it.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    // One accumulator per power of x to keep the four dependency chains
    // independent inside the loop.
    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff in to a vector
    cutoff_vector = vdupq_n_f32(*cutoff);
    // ... center point array
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
    for (i = 0; i < num_points / 4; ++i) {
        // load x
        x_to_1 = vld1q_f32(src0);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);         // cpa[0] * x^1
        x_to_2 = vmulq_f32(x_to_2, cpa_1);         // cpa[1] * x^2
        x_to_3 = vmulq_f32(x_to_3, cpa_2);         // cpa[2] * x^3
        x_to_4 = vmulq_f32(x_to_4, cpa_3);         // cpa[3] * x^4
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    // Fold the four per-term accumulators together, then reduce to scalar.
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    // Scalar tail for the final num_points % 4 samples.
    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        // fith = sq * thrd;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth); //+
    }

    // Constant term c0 contributes once per point.
    *target = accumulator + (float)num_points * center_point_array[4];
}
498 | |||
499 | #endif /* LV_HAVE_NEON */ | ||
500 | |||
501 | #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ | ||
502 | |||
503 | #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H | ||
504 | #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H | ||
505 | |||
506 | #include <inttypes.h> | ||
507 | #include <stdio.h> | ||
508 | #include <volk/volk_complex.h> | ||
509 | |||
510 | #ifndef MAX | ||
511 | #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) | ||
512 | #endif | ||
513 | |||
514 | #if LV_HAVE_AVX && LV_HAVE_FMA | ||
515 | #include <immintrin.h> | ||
516 | |||
/*!
 * Unaligned AVX+FMA implementation (uses _mm256_loadu_ps, so src0 may be
 * arbitrarily aligned).
 *
 * Accumulates cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4 for every
 * sample x = max(src0[i], *cutoff); the constant term num_points*cpa[4]
 * is added once at the end.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values (no alignment requirement)
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                         float* src0,
                                                         float* center_point_array,
                                                         float* cutoff,
                                                         unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    // Broadcast coefficients and cutoff once before the loop.
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); // clamp to cutoff
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        // Fuse the x^1 and x^3 terms onto the x^2/x^4 products.
        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0]*x   + cpa[1]*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2]*x^3 + cpa[3]*x^4
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    // Scalar tail for the final num_points % 8 samples.
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    // Constant term c0 contributes once per point.
    *target += (float)(num_points)*center_point_array[4];
}
582 | #endif // LV_HAVE_AVX && LV_HAVE_FMA | ||
583 | |||
584 | #ifdef LV_HAVE_AVX | ||
585 | #include <immintrin.h> | ||
586 | |||
/*!
 * Unaligned AVX implementation (uses _mm256_loadu_ps, so src0 may be
 * arbitrarily aligned; no FMA used).
 *
 * Accumulates cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4 for every
 * sample x = max(src0[i], *cutoff); the constant term num_points*cpa[4]
 * is added once at the end.
 *
 * \param target             output: single-float accumulated sum
 * \param src0               input x values (no alignment requirement)
 * \param center_point_array polynomial coefficients {c1, c2, c3, c4, c0}
 * \param cutoff             minimum x value; smaller inputs are clamped
 * \param num_points         number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    // Broadcast coefficients and cutoff once before the loop.
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); // clamp to cutoff
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    // Scalar tail for the final num_points % 8 samples.
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    // Constant term c0 contributes once per point.
    *target += (float)(num_points)*center_point_array[4];
}
655 | #endif // LV_HAVE_AVX | ||
656 | |||
657 | #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/ | ||
658 |