Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_x2_pow_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Raises the sample in aVector to the power of the number in bVector. | ||
16 | * | ||
17 | * c[i] = pow(a[i], b[i]) | ||
18 | * | ||
19 | * <b>Dispatcher Prototype</b> | ||
20 | * \code | ||
21 | * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, | ||
22 | * unsigned int num_points) \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li bVector: The input vector of indices (power values). | ||
26 | * \li aVector: The input vector of base values. | ||
27 | * \li num_points: The number of values in both input vectors. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li cVector: The output vector. | ||
31 | * | ||
32 | * \b Example | ||
33 | * Calculate the first two powers of two (2^x). | ||
34 | * \code | ||
35 | * int N = 10; | ||
36 | * unsigned int alignment = volk_get_alignment(); | ||
37 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
38 | * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
39 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
40 | * | ||
41 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
42 | * increasing[ii] = (float)ii; | ||
43 | * twos[ii] = 2.f; | ||
44 | * } | ||
45 | * | ||
46 | * volk_32f_x2_pow_32f(out, increasing, twos, N); | ||
47 | * | ||
48 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
49 | * printf("out[%u] = %1.2f\n", ii, out[ii]); | ||
50 | * } | ||
51 | * | ||
52 | * volk_free(increasing); | ||
53 | * volk_free(twos); | ||
54 | * volk_free(out); | ||
55 | * \endcode | ||
56 | */ | ||
57 | |||
58 | #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H | ||
59 | #define INCLUDED_volk_32f_x2_pow_32f_a_H | ||
60 | |||
61 | #include <inttypes.h> | ||
62 | #include <math.h> | ||
63 | #include <stdio.h> | ||
64 | #include <stdlib.h> | ||
65 | |||
66 | #define POW_POLY_DEGREE 3 | ||
67 | |||
68 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
69 | #include <immintrin.h> | ||
70 | |||
71 | #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) | ||
72 | #define POLY1_AVX2_FMA(x, c0, c1) \ | ||
73 | _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) | ||
74 | #define POLY2_AVX2_FMA(x, c0, c1, c2) \ | ||
75 | _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
76 | #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ | ||
77 | _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
78 | #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ | ||
79 | _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
80 | #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ | ||
81 | _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
82 | |||
83 | 2 | static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, | |
84 | const float* bVector, | ||
85 | const float* aVector, | ||
86 | unsigned int num_points) | ||
87 | { | ||
88 | 2 | float* cPtr = cVector; | |
89 | 2 | const float* bPtr = bVector; | |
90 | 2 | const float* aPtr = aVector; | |
91 | |||
92 | 2 | unsigned int number = 0; | |
93 | 2 | const unsigned int eighthPoints = num_points / 8; | |
94 | |||
95 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
96 | __m256 tmp, fx, mask, pow2n, z, y; | ||
97 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
98 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
99 | __m256i bias, exp, emm0, pi32_0x7f; | ||
100 | |||
101 | 2 | one = _mm256_set1_ps(1.0); | |
102 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
103 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
104 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
105 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
106 | 2 | half = _mm256_set1_ps(0.5); | |
107 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
108 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
109 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
110 | |||
111 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
112 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
113 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
114 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
115 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
116 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
117 | |||
118 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
119 | // First compute the logarithm | ||
120 | 32766 | aVal = _mm256_load_ps(aPtr); | |
121 | 32766 | bias = _mm256_set1_epi32(127); | |
122 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
123 | 163830 | exp = _mm256_sub_epi32( | |
124 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
125 | _mm256_set1_epi32(0x7f800000)), | ||
126 | 23), | ||
127 | bias); | ||
128 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
129 | |||
130 | 131064 | frac = _mm256_or_ps( | |
131 | leadingOne, | ||
132 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
133 | |||
134 | #if POW_POLY_DEGREE == 6 | ||
135 | mantissa = POLY5_AVX2_FMA(frac, | ||
136 | 3.1157899f, | ||
137 | -3.3241990f, | ||
138 | 2.5988452f, | ||
139 | -1.2315303f, | ||
140 | 3.1821337e-1f, | ||
141 | -3.4436006e-2f); | ||
142 | #elif POW_POLY_DEGREE == 5 | ||
143 | mantissa = POLY4_AVX2_FMA(frac, | ||
144 | 2.8882704548164776201f, | ||
145 | -2.52074962577807006663f, | ||
146 | 1.48116647521213171641f, | ||
147 | -0.465725644288844778798f, | ||
148 | 0.0596515482674574969533f); | ||
149 | #elif POW_POLY_DEGREE == 4 | ||
150 | mantissa = POLY3_AVX2_FMA(frac, | ||
151 | 2.61761038894603480148f, | ||
152 | -1.75647175389045657003f, | ||
153 | 0.688243882994381274313f, | ||
154 | -0.107254423828329604454f); | ||
155 | #elif POW_POLY_DEGREE == 3 | ||
156 | 163830 | mantissa = POLY2_AVX2_FMA(frac, | |
157 | 2.28330284476918490682f, | ||
158 | -1.04913055217340124191f, | ||
159 | 0.204446009836232697516f); | ||
160 | #else | ||
161 | #error | ||
162 | #endif | ||
163 | |||
164 | 65532 | logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); | |
165 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
166 | |||
167 | // Now calculate b*lna | ||
168 | 32766 | bVal = _mm256_load_ps(bPtr); | |
169 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
170 | |||
171 | // Now compute exp(b*lna) | ||
172 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
173 | |||
174 | 32766 | fx = _mm256_fmadd_ps(bVal, log2EF, half); | |
175 | |||
176 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
177 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
178 | |||
179 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
180 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
181 | |||
182 | 32766 | tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); | |
183 | 32766 | bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); | |
184 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
185 | |||
186 | 32766 | y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); | |
187 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p2); | |
188 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p3); | |
189 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p4); | |
190 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p5); | |
191 | 32766 | y = _mm256_fmadd_ps(y, z, bVal); | |
192 | 32766 | y = _mm256_add_ps(y, one); | |
193 | |||
194 | emm0 = | ||
195 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
196 | |||
197 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
198 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
199 | |||
200 | _mm256_store_ps(cPtr, cVal); | ||
201 | |||
202 | 32766 | aPtr += 8; | |
203 | 32766 | bPtr += 8; | |
204 | 32766 | cPtr += 8; | |
205 | } | ||
206 | |||
207 | 2 | number = eighthPoints * 8; | |
208 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
209 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
210 | } | ||
211 | 2 | } | |
212 | |||
213 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
214 | |||
215 | #ifdef LV_HAVE_AVX2 | ||
216 | #include <immintrin.h> | ||
217 | |||
218 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
219 | #define POLY1_AVX2(x, c0, c1) \ | ||
220 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
221 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
222 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
223 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
224 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
225 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
226 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
227 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
228 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
229 | |||
230 | 2 | static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector, | |
231 | const float* bVector, | ||
232 | const float* aVector, | ||
233 | unsigned int num_points) | ||
234 | { | ||
235 | 2 | float* cPtr = cVector; | |
236 | 2 | const float* bPtr = bVector; | |
237 | 2 | const float* aPtr = aVector; | |
238 | |||
239 | 2 | unsigned int number = 0; | |
240 | 2 | const unsigned int eighthPoints = num_points / 8; | |
241 | |||
242 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
243 | __m256 tmp, fx, mask, pow2n, z, y; | ||
244 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
245 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
246 | __m256i bias, exp, emm0, pi32_0x7f; | ||
247 | |||
248 | 2 | one = _mm256_set1_ps(1.0); | |
249 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
250 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
251 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
252 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
253 | 2 | half = _mm256_set1_ps(0.5); | |
254 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
255 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
256 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
257 | |||
258 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
259 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
260 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
261 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
262 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
263 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
264 | |||
265 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
266 | // First compute the logarithm | ||
267 | 32766 | aVal = _mm256_load_ps(aPtr); | |
268 | 32766 | bias = _mm256_set1_epi32(127); | |
269 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
270 | 163830 | exp = _mm256_sub_epi32( | |
271 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
272 | _mm256_set1_epi32(0x7f800000)), | ||
273 | 23), | ||
274 | bias); | ||
275 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
276 | |||
277 | 131064 | frac = _mm256_or_ps( | |
278 | leadingOne, | ||
279 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
280 | |||
281 | #if POW_POLY_DEGREE == 6 | ||
282 | mantissa = POLY5_AVX2(frac, | ||
283 | 3.1157899f, | ||
284 | -3.3241990f, | ||
285 | 2.5988452f, | ||
286 | -1.2315303f, | ||
287 | 3.1821337e-1f, | ||
288 | -3.4436006e-2f); | ||
289 | #elif POW_POLY_DEGREE == 5 | ||
290 | mantissa = POLY4_AVX2(frac, | ||
291 | 2.8882704548164776201f, | ||
292 | -2.52074962577807006663f, | ||
293 | 1.48116647521213171641f, | ||
294 | -0.465725644288844778798f, | ||
295 | 0.0596515482674574969533f); | ||
296 | #elif POW_POLY_DEGREE == 4 | ||
297 | mantissa = POLY3_AVX2(frac, | ||
298 | 2.61761038894603480148f, | ||
299 | -1.75647175389045657003f, | ||
300 | 0.688243882994381274313f, | ||
301 | -0.107254423828329604454f); | ||
302 | #elif POW_POLY_DEGREE == 3 | ||
303 | 229362 | mantissa = POLY2_AVX2(frac, | |
304 | 2.28330284476918490682f, | ||
305 | -1.04913055217340124191f, | ||
306 | 0.204446009836232697516f); | ||
307 | #else | ||
308 | #error | ||
309 | #endif | ||
310 | |||
311 | 98298 | logarithm = _mm256_add_ps( | |
312 | _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); | ||
313 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
314 | |||
315 | // Now calculate b*lna | ||
316 | 32766 | bVal = _mm256_load_ps(bPtr); | |
317 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
318 | |||
319 | // Now compute exp(b*lna) | ||
320 | 98298 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
321 | |||
322 | 65532 | fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); | |
323 | |||
324 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
325 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
326 | |||
327 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
328 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
329 | |||
330 | 65532 | tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); | |
331 | 65532 | bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); | |
332 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
333 | |||
334 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); | |
335 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); | |
336 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); | |
337 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); | |
338 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); | |
339 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); | |
340 | 32766 | y = _mm256_add_ps(y, one); | |
341 | |||
342 | emm0 = | ||
343 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
344 | |||
345 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
346 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
347 | |||
348 | _mm256_store_ps(cPtr, cVal); | ||
349 | |||
350 | 32766 | aPtr += 8; | |
351 | 32766 | bPtr += 8; | |
352 | 32766 | cPtr += 8; | |
353 | } | ||
354 | |||
355 | 2 | number = eighthPoints * 8; | |
356 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
357 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
358 | } | ||
359 | 2 | } | |
360 | |||
361 | #endif /* LV_HAVE_AVX2 for aligned */ | ||
362 | |||
363 | |||
364 | #ifdef LV_HAVE_SSE4_1 | ||
365 | #include <smmintrin.h> | ||
366 | |||
367 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
368 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
369 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
370 | #define POLY3(x, c0, c1, c2, c3) \ | ||
371 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
372 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
373 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
374 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
375 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
376 | |||
377 | 2 | static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, | |
378 | const float* bVector, | ||
379 | const float* aVector, | ||
380 | unsigned int num_points) | ||
381 | { | ||
382 | 2 | float* cPtr = cVector; | |
383 | 2 | const float* bPtr = bVector; | |
384 | 2 | const float* aPtr = aVector; | |
385 | |||
386 | 2 | unsigned int number = 0; | |
387 | 2 | const unsigned int quarterPoints = num_points / 4; | |
388 | |||
389 | __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
390 | __m128 tmp, fx, mask, pow2n, z, y; | ||
391 | __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
392 | __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
393 | __m128i bias, exp, emm0, pi32_0x7f; | ||
394 | |||
395 | 2 | one = _mm_set1_ps(1.0); | |
396 | 2 | exp_hi = _mm_set1_ps(88.3762626647949); | |
397 | 2 | exp_lo = _mm_set1_ps(-88.3762626647949); | |
398 | 2 | ln2 = _mm_set1_ps(0.6931471805); | |
399 | 2 | log2EF = _mm_set1_ps(1.44269504088896341); | |
400 | 2 | half = _mm_set1_ps(0.5); | |
401 | 2 | exp_C1 = _mm_set1_ps(0.693359375); | |
402 | 2 | exp_C2 = _mm_set1_ps(-2.12194440e-4); | |
403 | 2 | pi32_0x7f = _mm_set1_epi32(0x7f); | |
404 | |||
405 | 2 | exp_p0 = _mm_set1_ps(1.9875691500e-4); | |
406 | 2 | exp_p1 = _mm_set1_ps(1.3981999507e-3); | |
407 | 2 | exp_p2 = _mm_set1_ps(8.3334519073e-3); | |
408 | 2 | exp_p3 = _mm_set1_ps(4.1665795894e-2); | |
409 | 2 | exp_p4 = _mm_set1_ps(1.6666665459e-1); | |
410 | 2 | exp_p5 = _mm_set1_ps(5.0000001201e-1); | |
411 | |||
412 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
413 | // First compute the logarithm | ||
414 | 65534 | aVal = _mm_load_ps(aPtr); | |
415 | 65534 | bias = _mm_set1_epi32(127); | |
416 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
417 | 327670 | exp = _mm_sub_epi32( | |
418 | _mm_srli_epi32( | ||
419 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
420 | bias); | ||
421 | 65534 | logarithm = _mm_cvtepi32_ps(exp); | |
422 | |||
423 | 262136 | frac = _mm_or_ps(leadingOne, | |
424 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
425 | |||
426 | #if POW_POLY_DEGREE == 6 | ||
427 | mantissa = POLY5(frac, | ||
428 | 3.1157899f, | ||
429 | -3.3241990f, | ||
430 | 2.5988452f, | ||
431 | -1.2315303f, | ||
432 | 3.1821337e-1f, | ||
433 | -3.4436006e-2f); | ||
434 | #elif POW_POLY_DEGREE == 5 | ||
435 | mantissa = POLY4(frac, | ||
436 | 2.8882704548164776201f, | ||
437 | -2.52074962577807006663f, | ||
438 | 1.48116647521213171641f, | ||
439 | -0.465725644288844778798f, | ||
440 | 0.0596515482674574969533f); | ||
441 | #elif POW_POLY_DEGREE == 4 | ||
442 | mantissa = POLY3(frac, | ||
443 | 2.61761038894603480148f, | ||
444 | -1.75647175389045657003f, | ||
445 | 0.688243882994381274313f, | ||
446 | -0.107254423828329604454f); | ||
447 | #elif POW_POLY_DEGREE == 3 | ||
448 | 458738 | mantissa = POLY2(frac, | |
449 | 2.28330284476918490682f, | ||
450 | -1.04913055217340124191f, | ||
451 | 0.204446009836232697516f); | ||
452 | #else | ||
453 | #error | ||
454 | #endif | ||
455 | |||
456 | logarithm = | ||
457 | 196602 | _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
458 | 65534 | logarithm = _mm_mul_ps(logarithm, ln2); | |
459 | |||
460 | |||
461 | // Now calculate b*lna | ||
462 | 65534 | bVal = _mm_load_ps(bPtr); | |
463 | 65534 | bVal = _mm_mul_ps(bVal, logarithm); | |
464 | |||
465 | // Now compute exp(b*lna) | ||
466 | 131068 | bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); | |
467 | |||
468 | 131068 | fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); | |
469 | |||
470 | 65534 | emm0 = _mm_cvttps_epi32(fx); | |
471 | 65534 | tmp = _mm_cvtepi32_ps(emm0); | |
472 | |||
473 | 131068 | mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); | |
474 | 65534 | fx = _mm_sub_ps(tmp, mask); | |
475 | |||
476 | 65534 | tmp = _mm_mul_ps(fx, exp_C1); | |
477 | 65534 | z = _mm_mul_ps(fx, exp_C2); | |
478 | 131068 | bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); | |
479 | 65534 | z = _mm_mul_ps(bVal, bVal); | |
480 | |||
481 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); | |
482 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); | |
483 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); | |
484 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); | |
485 | 65534 | y = _mm_add_ps(y, one); | |
486 | |||
487 | 196602 | emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); | |
488 | |||
489 | 65534 | pow2n = _mm_castsi128_ps(emm0); | |
490 | 65534 | cVal = _mm_mul_ps(y, pow2n); | |
491 | |||
492 | _mm_store_ps(cPtr, cVal); | ||
493 | |||
494 | 65534 | aPtr += 4; | |
495 | 65534 | bPtr += 4; | |
496 | 65534 | cPtr += 4; | |
497 | } | ||
498 | |||
499 | 2 | number = quarterPoints * 4; | |
500 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
501 | 6 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
502 | } | ||
503 | 2 | } | |
504 | |||
505 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
506 | |||
507 | #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */ | ||
508 | |||
509 | #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H | ||
510 | #define INCLUDED_volk_32f_x2_pow_32f_u_H | ||
511 | |||
512 | #include <inttypes.h> | ||
513 | #include <math.h> | ||
514 | #include <stdio.h> | ||
515 | #include <stdlib.h> | ||
516 | |||
517 | #define POW_POLY_DEGREE 3 | ||
518 | |||
519 | #ifdef LV_HAVE_GENERIC | ||
520 | |||
521 | 2 | static inline void volk_32f_x2_pow_32f_generic(float* cVector, | |
522 | const float* bVector, | ||
523 | const float* aVector, | ||
524 | unsigned int num_points) | ||
525 | { | ||
526 | 2 | float* cPtr = cVector; | |
527 | 2 | const float* bPtr = bVector; | |
528 | 2 | const float* aPtr = aVector; | |
529 | 2 | unsigned int number = 0; | |
530 | |||
531 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
532 | 262142 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
533 | } | ||
534 | 2 | } | |
535 | #endif /* LV_HAVE_GENERIC */ | ||
536 | |||
537 | |||
538 | #ifdef LV_HAVE_SSE4_1 | ||
539 | #include <smmintrin.h> | ||
540 | |||
541 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
542 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
543 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
544 | #define POLY3(x, c0, c1, c2, c3) \ | ||
545 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
546 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
547 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
548 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
549 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
550 | |||
551 | 2 | static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, | |
552 | const float* bVector, | ||
553 | const float* aVector, | ||
554 | unsigned int num_points) | ||
555 | { | ||
556 | 2 | float* cPtr = cVector; | |
557 | 2 | const float* bPtr = bVector; | |
558 | 2 | const float* aPtr = aVector; | |
559 | |||
560 | 2 | unsigned int number = 0; | |
561 | 2 | const unsigned int quarterPoints = num_points / 4; | |
562 | |||
563 | __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
564 | __m128 tmp, fx, mask, pow2n, z, y; | ||
565 | __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
566 | __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
567 | __m128i bias, exp, emm0, pi32_0x7f; | ||
568 | |||
569 | 2 | one = _mm_set1_ps(1.0); | |
570 | 2 | exp_hi = _mm_set1_ps(88.3762626647949); | |
571 | 2 | exp_lo = _mm_set1_ps(-88.3762626647949); | |
572 | 2 | ln2 = _mm_set1_ps(0.6931471805); | |
573 | 2 | log2EF = _mm_set1_ps(1.44269504088896341); | |
574 | 2 | half = _mm_set1_ps(0.5); | |
575 | 2 | exp_C1 = _mm_set1_ps(0.693359375); | |
576 | 2 | exp_C2 = _mm_set1_ps(-2.12194440e-4); | |
577 | 2 | pi32_0x7f = _mm_set1_epi32(0x7f); | |
578 | |||
579 | 2 | exp_p0 = _mm_set1_ps(1.9875691500e-4); | |
580 | 2 | exp_p1 = _mm_set1_ps(1.3981999507e-3); | |
581 | 2 | exp_p2 = _mm_set1_ps(8.3334519073e-3); | |
582 | 2 | exp_p3 = _mm_set1_ps(4.1665795894e-2); | |
583 | 2 | exp_p4 = _mm_set1_ps(1.6666665459e-1); | |
584 | 2 | exp_p5 = _mm_set1_ps(5.0000001201e-1); | |
585 | |||
586 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
587 | // First compute the logarithm | ||
588 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
589 | 65534 | bias = _mm_set1_epi32(127); | |
590 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
591 | 327670 | exp = _mm_sub_epi32( | |
592 | _mm_srli_epi32( | ||
593 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
594 | bias); | ||
595 | 65534 | logarithm = _mm_cvtepi32_ps(exp); | |
596 | |||
597 | 262136 | frac = _mm_or_ps(leadingOne, | |
598 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
599 | |||
600 | #if POW_POLY_DEGREE == 6 | ||
601 | mantissa = POLY5(frac, | ||
602 | 3.1157899f, | ||
603 | -3.3241990f, | ||
604 | 2.5988452f, | ||
605 | -1.2315303f, | ||
606 | 3.1821337e-1f, | ||
607 | -3.4436006e-2f); | ||
608 | #elif POW_POLY_DEGREE == 5 | ||
609 | mantissa = POLY4(frac, | ||
610 | 2.8882704548164776201f, | ||
611 | -2.52074962577807006663f, | ||
612 | 1.48116647521213171641f, | ||
613 | -0.465725644288844778798f, | ||
614 | 0.0596515482674574969533f); | ||
615 | #elif POW_POLY_DEGREE == 4 | ||
616 | mantissa = POLY3(frac, | ||
617 | 2.61761038894603480148f, | ||
618 | -1.75647175389045657003f, | ||
619 | 0.688243882994381274313f, | ||
620 | -0.107254423828329604454f); | ||
621 | #elif POW_POLY_DEGREE == 3 | ||
622 | 458738 | mantissa = POLY2(frac, | |
623 | 2.28330284476918490682f, | ||
624 | -1.04913055217340124191f, | ||
625 | 0.204446009836232697516f); | ||
626 | #else | ||
627 | #error | ||
628 | #endif | ||
629 | |||
630 | logarithm = | ||
631 | 196602 | _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
632 | 65534 | logarithm = _mm_mul_ps(logarithm, ln2); | |
633 | |||
634 | |||
635 | // Now calculate b*lna | ||
636 | 65534 | bVal = _mm_loadu_ps(bPtr); | |
637 | 65534 | bVal = _mm_mul_ps(bVal, logarithm); | |
638 | |||
639 | // Now compute exp(b*lna) | ||
640 | 131068 | bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); | |
641 | |||
642 | 131068 | fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); | |
643 | |||
644 | 65534 | emm0 = _mm_cvttps_epi32(fx); | |
645 | 65534 | tmp = _mm_cvtepi32_ps(emm0); | |
646 | |||
647 | 131068 | mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); | |
648 | 65534 | fx = _mm_sub_ps(tmp, mask); | |
649 | |||
650 | 65534 | tmp = _mm_mul_ps(fx, exp_C1); | |
651 | 65534 | z = _mm_mul_ps(fx, exp_C2); | |
652 | 131068 | bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); | |
653 | 65534 | z = _mm_mul_ps(bVal, bVal); | |
654 | |||
655 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); | |
656 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); | |
657 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); | |
658 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); | |
659 | 65534 | y = _mm_add_ps(y, one); | |
660 | |||
661 | 196602 | emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); | |
662 | |||
663 | 65534 | pow2n = _mm_castsi128_ps(emm0); | |
664 | 65534 | cVal = _mm_mul_ps(y, pow2n); | |
665 | |||
666 | _mm_storeu_ps(cPtr, cVal); | ||
667 | |||
668 | 65534 | aPtr += 4; | |
669 | 65534 | bPtr += 4; | |
670 | 65534 | cPtr += 4; | |
671 | } | ||
672 | |||
673 | 2 | number = quarterPoints * 4; | |
674 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
675 | 6 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
676 | } | ||
677 | 2 | } | |
678 | |||
679 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
680 | |||
681 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
682 | #include <immintrin.h> | ||
683 | |||
684 | #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) | ||
685 | #define POLY1_AVX2_FMA(x, c0, c1) \ | ||
686 | _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) | ||
687 | #define POLY2_AVX2_FMA(x, c0, c1, c2) \ | ||
688 | _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
689 | #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ | ||
690 | _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
691 | #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ | ||
692 | _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
693 | #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ | ||
694 | _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
695 | |||
696 | 2 | static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, | |
697 | const float* bVector, | ||
698 | const float* aVector, | ||
699 | unsigned int num_points) | ||
700 | { | ||
701 | 2 | float* cPtr = cVector; | |
702 | 2 | const float* bPtr = bVector; | |
703 | 2 | const float* aPtr = aVector; | |
704 | |||
705 | 2 | unsigned int number = 0; | |
706 | 2 | const unsigned int eighthPoints = num_points / 8; | |
707 | |||
708 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
709 | __m256 tmp, fx, mask, pow2n, z, y; | ||
710 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
711 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
712 | __m256i bias, exp, emm0, pi32_0x7f; | ||
713 | |||
714 | 2 | one = _mm256_set1_ps(1.0); | |
715 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
716 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
717 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
718 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
719 | 2 | half = _mm256_set1_ps(0.5); | |
720 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
721 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
722 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
723 | |||
724 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
725 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
726 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
727 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
728 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
729 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
730 | |||
731 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
732 | // First compute the logarithm | ||
733 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
734 | 32766 | bias = _mm256_set1_epi32(127); | |
735 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
736 | 163830 | exp = _mm256_sub_epi32( | |
737 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
738 | _mm256_set1_epi32(0x7f800000)), | ||
739 | 23), | ||
740 | bias); | ||
741 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
742 | |||
743 | 131064 | frac = _mm256_or_ps( | |
744 | leadingOne, | ||
745 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
746 | |||
747 | #if POW_POLY_DEGREE == 6 | ||
748 | mantissa = POLY5_AVX2_FMA(frac, | ||
749 | 3.1157899f, | ||
750 | -3.3241990f, | ||
751 | 2.5988452f, | ||
752 | -1.2315303f, | ||
753 | 3.1821337e-1f, | ||
754 | -3.4436006e-2f); | ||
755 | #elif POW_POLY_DEGREE == 5 | ||
756 | mantissa = POLY4_AVX2_FMA(frac, | ||
757 | 2.8882704548164776201f, | ||
758 | -2.52074962577807006663f, | ||
759 | 1.48116647521213171641f, | ||
760 | -0.465725644288844778798f, | ||
761 | 0.0596515482674574969533f); | ||
762 | #elif POW_POLY_DEGREE == 4 | ||
763 | mantissa = POLY3_AVX2_FMA(frac, | ||
764 | 2.61761038894603480148f, | ||
765 | -1.75647175389045657003f, | ||
766 | 0.688243882994381274313f, | ||
767 | -0.107254423828329604454f); | ||
768 | #elif POW_POLY_DEGREE == 3 | ||
769 | 163830 | mantissa = POLY2_AVX2_FMA(frac, | |
770 | 2.28330284476918490682f, | ||
771 | -1.04913055217340124191f, | ||
772 | 0.204446009836232697516f); | ||
773 | #else | ||
774 | #error | ||
775 | #endif | ||
776 | |||
777 | 65532 | logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); | |
778 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
779 | |||
780 | |||
781 | // Now calculate b*lna | ||
782 | 32766 | bVal = _mm256_loadu_ps(bPtr); | |
783 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
784 | |||
785 | // Now compute exp(b*lna) | ||
786 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
787 | |||
788 | 32766 | fx = _mm256_fmadd_ps(bVal, log2EF, half); | |
789 | |||
790 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
791 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
792 | |||
793 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
794 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
795 | |||
796 | 32766 | tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); | |
797 | 32766 | bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); | |
798 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
799 | |||
800 | 32766 | y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); | |
801 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p2); | |
802 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p3); | |
803 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p4); | |
804 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p5); | |
805 | 32766 | y = _mm256_fmadd_ps(y, z, bVal); | |
806 | 32766 | y = _mm256_add_ps(y, one); | |
807 | |||
808 | emm0 = | ||
809 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
810 | |||
811 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
812 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
813 | |||
814 | _mm256_storeu_ps(cPtr, cVal); | ||
815 | |||
816 | 32766 | aPtr += 8; | |
817 | 32766 | bPtr += 8; | |
818 | 32766 | cPtr += 8; | |
819 | } | ||
820 | |||
821 | 2 | number = eighthPoints * 8; | |
822 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
823 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
824 | } | ||
825 | 2 | } | |
826 | |||
827 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
828 | |||
829 | #ifdef LV_HAVE_AVX2 | ||
830 | #include <immintrin.h> | ||
831 | |||
832 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
833 | #define POLY1_AVX2(x, c0, c1) \ | ||
834 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
835 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
836 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
837 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
838 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
839 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
840 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
841 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
842 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
843 | |||
844 | 2 | static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector, | |
845 | const float* bVector, | ||
846 | const float* aVector, | ||
847 | unsigned int num_points) | ||
848 | { | ||
849 | 2 | float* cPtr = cVector; | |
850 | 2 | const float* bPtr = bVector; | |
851 | 2 | const float* aPtr = aVector; | |
852 | |||
853 | 2 | unsigned int number = 0; | |
854 | 2 | const unsigned int eighthPoints = num_points / 8; | |
855 | |||
856 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
857 | __m256 tmp, fx, mask, pow2n, z, y; | ||
858 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
859 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
860 | __m256i bias, exp, emm0, pi32_0x7f; | ||
861 | |||
862 | 2 | one = _mm256_set1_ps(1.0); | |
863 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
864 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
865 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
866 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
867 | 2 | half = _mm256_set1_ps(0.5); | |
868 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
869 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
870 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
871 | |||
872 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
873 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
874 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
875 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
876 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
877 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
878 | |||
879 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
880 | // First compute the logarithm | ||
881 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
882 | 32766 | bias = _mm256_set1_epi32(127); | |
883 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
884 | 163830 | exp = _mm256_sub_epi32( | |
885 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
886 | _mm256_set1_epi32(0x7f800000)), | ||
887 | 23), | ||
888 | bias); | ||
889 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
890 | |||
891 | 131064 | frac = _mm256_or_ps( | |
892 | leadingOne, | ||
893 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
894 | |||
895 | #if POW_POLY_DEGREE == 6 | ||
896 | mantissa = POLY5_AVX2(frac, | ||
897 | 3.1157899f, | ||
898 | -3.3241990f, | ||
899 | 2.5988452f, | ||
900 | -1.2315303f, | ||
901 | 3.1821337e-1f, | ||
902 | -3.4436006e-2f); | ||
903 | #elif POW_POLY_DEGREE == 5 | ||
904 | mantissa = POLY4_AVX2(frac, | ||
905 | 2.8882704548164776201f, | ||
906 | -2.52074962577807006663f, | ||
907 | 1.48116647521213171641f, | ||
908 | -0.465725644288844778798f, | ||
909 | 0.0596515482674574969533f); | ||
910 | #elif POW_POLY_DEGREE == 4 | ||
911 | mantissa = POLY3_AVX2(frac, | ||
912 | 2.61761038894603480148f, | ||
913 | -1.75647175389045657003f, | ||
914 | 0.688243882994381274313f, | ||
915 | -0.107254423828329604454f); | ||
916 | #elif POW_POLY_DEGREE == 3 | ||
917 | 229362 | mantissa = POLY2_AVX2(frac, | |
918 | 2.28330284476918490682f, | ||
919 | -1.04913055217340124191f, | ||
920 | 0.204446009836232697516f); | ||
921 | #else | ||
922 | #error | ||
923 | #endif | ||
924 | |||
925 | 98298 | logarithm = _mm256_add_ps( | |
926 | _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); | ||
927 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
928 | |||
929 | // Now calculate b*lna | ||
930 | 32766 | bVal = _mm256_loadu_ps(bPtr); | |
931 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
932 | |||
933 | // Now compute exp(b*lna) | ||
934 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
935 | |||
936 | 65532 | fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); | |
937 | |||
938 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
939 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
940 | |||
941 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
942 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
943 | |||
944 | 65532 | tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); | |
945 | 65532 | bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); | |
946 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
947 | |||
948 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); | |
949 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); | |
950 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); | |
951 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); | |
952 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); | |
953 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); | |
954 | 32766 | y = _mm256_add_ps(y, one); | |
955 | |||
956 | emm0 = | ||
957 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
958 | |||
959 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
960 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
961 | |||
962 | _mm256_storeu_ps(cPtr, cVal); | ||
963 | |||
964 | 32766 | aPtr += 8; | |
965 | 32766 | bPtr += 8; | |
966 | 32766 | cPtr += 8; | |
967 | } | ||
968 | |||
969 | 2 | number = eighthPoints * 8; | |
970 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
971 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
972 | } | ||
973 | 2 | } | |
974 | |||
975 | #endif /* LV_HAVE_AVX2 for unaligned */ | ||
976 | |||
977 | #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */ | ||
978 |