| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32f_x2_pow_32f | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * Raises the sample in aVector to the power of the number in bVector. | ||
| 16 | * | ||
| 17 | * c[i] = pow(a[i], b[i]) | ||
| 18 | * | ||
| 19 | * <b>Dispatcher Prototype</b> | ||
| 20 | * \code | ||
| 21 | * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, | ||
| 22 | * unsigned int num_points) \endcode | ||
| 23 | * | ||
| 24 | * \b Inputs | ||
| 25 | * \li bVector: The input vector of indices (power values). | ||
| 26 | * \li aVector: The input vector of base values. | ||
| 27 | * \li num_points: The number of values in both input vectors. | ||
| 28 | * | ||
| 29 | * \b Outputs | ||
| 30 | * \li cVector: The output vector. | ||
| 31 | * | ||
| 32 | * \b Example | ||
| 33 | * Calculate the first two powers of two (2^x). | ||
| 34 | * \code | ||
| 35 | * int N = 10; | ||
| 36 | * unsigned int alignment = volk_get_alignment(); | ||
| 37 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
| 38 | * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
| 39 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
| 40 | * | ||
| 41 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
| 42 | * increasing[ii] = (float)ii; | ||
| 43 | * twos[ii] = 2.f; | ||
| 44 | * } | ||
| 45 | * | ||
| 46 | * volk_32f_x2_pow_32f(out, increasing, twos, N); | ||
| 47 | * | ||
| 48 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
| 49 | * printf("out[%u] = %1.2f\n", ii, out[ii]); | ||
| 50 | * } | ||
| 51 | * | ||
| 52 | * volk_free(increasing); | ||
| 53 | * volk_free(twos); | ||
| 54 | * volk_free(out); | ||
| 55 | * \endcode | ||
| 56 | */ | ||
| 57 | |||
| 58 | #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H | ||
| 59 | #define INCLUDED_volk_32f_x2_pow_32f_a_H | ||
| 60 | |||
| 61 | #include <inttypes.h> | ||
| 62 | #include <math.h> | ||
| 63 | #include <stdio.h> | ||
| 64 | #include <stdlib.h> | ||
| 65 | |||
| 66 | #define POW_POLY_DEGREE 3 | ||
| 67 | |||
| 68 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 69 | #include <immintrin.h> | ||
| 70 | |||
| 71 | #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) | ||
| 72 | #define POLY1_AVX2_FMA(x, c0, c1) \ | ||
| 73 | _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) | ||
| 74 | #define POLY2_AVX2_FMA(x, c0, c1, c2) \ | ||
| 75 | _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
| 76 | #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ | ||
| 77 | _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
| 78 | #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ | ||
| 79 | _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
| 80 | #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ | ||
| 81 | _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
| 82 | |||
| 83 | 2 | static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, | |
| 84 | const float* bVector, | ||
| 85 | const float* aVector, | ||
| 86 | unsigned int num_points) | ||
| 87 | { | ||
| 88 | 2 | float* cPtr = cVector; | |
| 89 | 2 | const float* bPtr = bVector; | |
| 90 | 2 | const float* aPtr = aVector; | |
| 91 | |||
| 92 | 2 | unsigned int number = 0; | |
| 93 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 94 | |||
| 95 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 96 | __m256 tmp, fx, mask, pow2n, z, y; | ||
| 97 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 98 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 99 | __m256i bias, exp, emm0, pi32_0x7f; | ||
| 100 | |||
| 101 | 2 | one = _mm256_set1_ps(1.0); | |
| 102 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
| 103 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
| 104 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
| 105 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
| 106 | 2 | half = _mm256_set1_ps(0.5); | |
| 107 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
| 108 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
| 109 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
| 110 | |||
| 111 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
| 112 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
| 113 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
| 114 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
| 115 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
| 116 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
| 117 | |||
| 118 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 119 | // First compute the logarithm | ||
| 120 | 32766 | aVal = _mm256_load_ps(aPtr); | |
| 121 | 32766 | bias = _mm256_set1_epi32(127); | |
| 122 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
| 123 | 163830 | exp = _mm256_sub_epi32( | |
| 124 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
| 125 | _mm256_set1_epi32(0x7f800000)), | ||
| 126 | 23), | ||
| 127 | bias); | ||
| 128 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
| 129 | |||
| 130 | 131064 | frac = _mm256_or_ps( | |
| 131 | leadingOne, | ||
| 132 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
| 133 | |||
| 134 | #if POW_POLY_DEGREE == 6 | ||
| 135 | mantissa = POLY5_AVX2_FMA(frac, | ||
| 136 | 3.1157899f, | ||
| 137 | -3.3241990f, | ||
| 138 | 2.5988452f, | ||
| 139 | -1.2315303f, | ||
| 140 | 3.1821337e-1f, | ||
| 141 | -3.4436006e-2f); | ||
| 142 | #elif POW_POLY_DEGREE == 5 | ||
| 143 | mantissa = POLY4_AVX2_FMA(frac, | ||
| 144 | 2.8882704548164776201f, | ||
| 145 | -2.52074962577807006663f, | ||
| 146 | 1.48116647521213171641f, | ||
| 147 | -0.465725644288844778798f, | ||
| 148 | 0.0596515482674574969533f); | ||
| 149 | #elif POW_POLY_DEGREE == 4 | ||
| 150 | mantissa = POLY3_AVX2_FMA(frac, | ||
| 151 | 2.61761038894603480148f, | ||
| 152 | -1.75647175389045657003f, | ||
| 153 | 0.688243882994381274313f, | ||
| 154 | -0.107254423828329604454f); | ||
| 155 | #elif POW_POLY_DEGREE == 3 | ||
| 156 | 163830 | mantissa = POLY2_AVX2_FMA(frac, | |
| 157 | 2.28330284476918490682f, | ||
| 158 | -1.04913055217340124191f, | ||
| 159 | 0.204446009836232697516f); | ||
| 160 | #else | ||
| 161 | #error | ||
| 162 | #endif | ||
| 163 | |||
| 164 | 65532 | logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); | |
| 165 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
| 166 | |||
| 167 | // Now calculate b*lna | ||
| 168 | 32766 | bVal = _mm256_load_ps(bPtr); | |
| 169 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
| 170 | |||
| 171 | // Now compute exp(b*lna) | ||
| 172 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
| 173 | |||
| 174 | 32766 | fx = _mm256_fmadd_ps(bVal, log2EF, half); | |
| 175 | |||
| 176 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
| 177 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
| 178 | |||
| 179 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
| 180 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
| 181 | |||
| 182 | 32766 | tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); | |
| 183 | 32766 | bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); | |
| 184 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
| 185 | |||
| 186 | 32766 | y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); | |
| 187 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p2); | |
| 188 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p3); | |
| 189 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p4); | |
| 190 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p5); | |
| 191 | 32766 | y = _mm256_fmadd_ps(y, z, bVal); | |
| 192 | 32766 | y = _mm256_add_ps(y, one); | |
| 193 | |||
| 194 | emm0 = | ||
| 195 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 196 | |||
| 197 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
| 198 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
| 199 | |||
| 200 | _mm256_store_ps(cPtr, cVal); | ||
| 201 | |||
| 202 | 32766 | aPtr += 8; | |
| 203 | 32766 | bPtr += 8; | |
| 204 | 32766 | cPtr += 8; | |
| 205 | } | ||
| 206 | |||
| 207 | 2 | number = eighthPoints * 8; | |
| 208 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 209 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
| 210 | } | ||
| 211 | 2 | } | |
| 212 | |||
| 213 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
| 214 | |||
| 215 | #ifdef LV_HAVE_AVX2 | ||
| 216 | #include <immintrin.h> | ||
| 217 | |||
| 218 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
| 219 | #define POLY1_AVX2(x, c0, c1) \ | ||
| 220 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
| 221 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
| 222 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
| 223 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
| 224 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
| 225 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
| 226 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
| 227 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
| 228 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
| 229 | |||
| 230 | 2 | static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector, | |
| 231 | const float* bVector, | ||
| 232 | const float* aVector, | ||
| 233 | unsigned int num_points) | ||
| 234 | { | ||
| 235 | 2 | float* cPtr = cVector; | |
| 236 | 2 | const float* bPtr = bVector; | |
| 237 | 2 | const float* aPtr = aVector; | |
| 238 | |||
| 239 | 2 | unsigned int number = 0; | |
| 240 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 241 | |||
| 242 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 243 | __m256 tmp, fx, mask, pow2n, z, y; | ||
| 244 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 245 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 246 | __m256i bias, exp, emm0, pi32_0x7f; | ||
| 247 | |||
| 248 | 2 | one = _mm256_set1_ps(1.0); | |
| 249 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
| 250 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
| 251 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
| 252 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
| 253 | 2 | half = _mm256_set1_ps(0.5); | |
| 254 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
| 255 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
| 256 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
| 257 | |||
| 258 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
| 259 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
| 260 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
| 261 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
| 262 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
| 263 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
| 264 | |||
| 265 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 266 | // First compute the logarithm | ||
| 267 | 32766 | aVal = _mm256_load_ps(aPtr); | |
| 268 | 32766 | bias = _mm256_set1_epi32(127); | |
| 269 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
| 270 | 163830 | exp = _mm256_sub_epi32( | |
| 271 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
| 272 | _mm256_set1_epi32(0x7f800000)), | ||
| 273 | 23), | ||
| 274 | bias); | ||
| 275 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
| 276 | |||
| 277 | 131064 | frac = _mm256_or_ps( | |
| 278 | leadingOne, | ||
| 279 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
| 280 | |||
| 281 | #if POW_POLY_DEGREE == 6 | ||
| 282 | mantissa = POLY5_AVX2(frac, | ||
| 283 | 3.1157899f, | ||
| 284 | -3.3241990f, | ||
| 285 | 2.5988452f, | ||
| 286 | -1.2315303f, | ||
| 287 | 3.1821337e-1f, | ||
| 288 | -3.4436006e-2f); | ||
| 289 | #elif POW_POLY_DEGREE == 5 | ||
| 290 | mantissa = POLY4_AVX2(frac, | ||
| 291 | 2.8882704548164776201f, | ||
| 292 | -2.52074962577807006663f, | ||
| 293 | 1.48116647521213171641f, | ||
| 294 | -0.465725644288844778798f, | ||
| 295 | 0.0596515482674574969533f); | ||
| 296 | #elif POW_POLY_DEGREE == 4 | ||
| 297 | mantissa = POLY3_AVX2(frac, | ||
| 298 | 2.61761038894603480148f, | ||
| 299 | -1.75647175389045657003f, | ||
| 300 | 0.688243882994381274313f, | ||
| 301 | -0.107254423828329604454f); | ||
| 302 | #elif POW_POLY_DEGREE == 3 | ||
| 303 | 229362 | mantissa = POLY2_AVX2(frac, | |
| 304 | 2.28330284476918490682f, | ||
| 305 | -1.04913055217340124191f, | ||
| 306 | 0.204446009836232697516f); | ||
| 307 | #else | ||
| 308 | #error | ||
| 309 | #endif | ||
| 310 | |||
| 311 | 98298 | logarithm = _mm256_add_ps( | |
| 312 | _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); | ||
| 313 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
| 314 | |||
| 315 | // Now calculate b*lna | ||
| 316 | 32766 | bVal = _mm256_load_ps(bPtr); | |
| 317 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
| 318 | |||
| 319 | // Now compute exp(b*lna) | ||
| 320 | 98298 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
| 321 | |||
| 322 | 65532 | fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); | |
| 323 | |||
| 324 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
| 325 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
| 326 | |||
| 327 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
| 328 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
| 329 | |||
| 330 | 65532 | tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); | |
| 331 | 65532 | bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); | |
| 332 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
| 333 | |||
| 334 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); | |
| 335 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); | |
| 336 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); | |
| 337 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); | |
| 338 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); | |
| 339 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); | |
| 340 | 32766 | y = _mm256_add_ps(y, one); | |
| 341 | |||
| 342 | emm0 = | ||
| 343 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 344 | |||
| 345 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
| 346 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
| 347 | |||
| 348 | _mm256_store_ps(cPtr, cVal); | ||
| 349 | |||
| 350 | 32766 | aPtr += 8; | |
| 351 | 32766 | bPtr += 8; | |
| 352 | 32766 | cPtr += 8; | |
| 353 | } | ||
| 354 | |||
| 355 | 2 | number = eighthPoints * 8; | |
| 356 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 357 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
| 358 | } | ||
| 359 | 2 | } | |
| 360 | |||
| 361 | #endif /* LV_HAVE_AVX2 for aligned */ | ||
| 362 | |||
| 363 | |||
| 364 | #ifdef LV_HAVE_SSE4_1 | ||
| 365 | #include <smmintrin.h> | ||
| 366 | |||
| 367 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
| 368 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
| 369 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
| 370 | #define POLY3(x, c0, c1, c2, c3) \ | ||
| 371 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
| 372 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
| 373 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
| 374 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
| 375 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
| 376 | |||
| 377 | 2 | static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, | |
| 378 | const float* bVector, | ||
| 379 | const float* aVector, | ||
| 380 | unsigned int num_points) | ||
| 381 | { | ||
| 382 | 2 | float* cPtr = cVector; | |
| 383 | 2 | const float* bPtr = bVector; | |
| 384 | 2 | const float* aPtr = aVector; | |
| 385 | |||
| 386 | 2 | unsigned int number = 0; | |
| 387 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 388 | |||
| 389 | __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 390 | __m128 tmp, fx, mask, pow2n, z, y; | ||
| 391 | __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 392 | __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 393 | __m128i bias, exp, emm0, pi32_0x7f; | ||
| 394 | |||
| 395 | 2 | one = _mm_set1_ps(1.0); | |
| 396 | 2 | exp_hi = _mm_set1_ps(88.3762626647949); | |
| 397 | 2 | exp_lo = _mm_set1_ps(-88.3762626647949); | |
| 398 | 2 | ln2 = _mm_set1_ps(0.6931471805); | |
| 399 | 2 | log2EF = _mm_set1_ps(1.44269504088896341); | |
| 400 | 2 | half = _mm_set1_ps(0.5); | |
| 401 | 2 | exp_C1 = _mm_set1_ps(0.693359375); | |
| 402 | 2 | exp_C2 = _mm_set1_ps(-2.12194440e-4); | |
| 403 | 2 | pi32_0x7f = _mm_set1_epi32(0x7f); | |
| 404 | |||
| 405 | 2 | exp_p0 = _mm_set1_ps(1.9875691500e-4); | |
| 406 | 2 | exp_p1 = _mm_set1_ps(1.3981999507e-3); | |
| 407 | 2 | exp_p2 = _mm_set1_ps(8.3334519073e-3); | |
| 408 | 2 | exp_p3 = _mm_set1_ps(4.1665795894e-2); | |
| 409 | 2 | exp_p4 = _mm_set1_ps(1.6666665459e-1); | |
| 410 | 2 | exp_p5 = _mm_set1_ps(5.0000001201e-1); | |
| 411 | |||
| 412 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
| 413 | // First compute the logarithm | ||
| 414 | 65534 | aVal = _mm_load_ps(aPtr); | |
| 415 | 65534 | bias = _mm_set1_epi32(127); | |
| 416 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
| 417 | 327670 | exp = _mm_sub_epi32( | |
| 418 | _mm_srli_epi32( | ||
| 419 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
| 420 | bias); | ||
| 421 | 65534 | logarithm = _mm_cvtepi32_ps(exp); | |
| 422 | |||
| 423 | 262136 | frac = _mm_or_ps(leadingOne, | |
| 424 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
| 425 | |||
| 426 | #if POW_POLY_DEGREE == 6 | ||
| 427 | mantissa = POLY5(frac, | ||
| 428 | 3.1157899f, | ||
| 429 | -3.3241990f, | ||
| 430 | 2.5988452f, | ||
| 431 | -1.2315303f, | ||
| 432 | 3.1821337e-1f, | ||
| 433 | -3.4436006e-2f); | ||
| 434 | #elif POW_POLY_DEGREE == 5 | ||
| 435 | mantissa = POLY4(frac, | ||
| 436 | 2.8882704548164776201f, | ||
| 437 | -2.52074962577807006663f, | ||
| 438 | 1.48116647521213171641f, | ||
| 439 | -0.465725644288844778798f, | ||
| 440 | 0.0596515482674574969533f); | ||
| 441 | #elif POW_POLY_DEGREE == 4 | ||
| 442 | mantissa = POLY3(frac, | ||
| 443 | 2.61761038894603480148f, | ||
| 444 | -1.75647175389045657003f, | ||
| 445 | 0.688243882994381274313f, | ||
| 446 | -0.107254423828329604454f); | ||
| 447 | #elif POW_POLY_DEGREE == 3 | ||
| 448 | 458738 | mantissa = POLY2(frac, | |
| 449 | 2.28330284476918490682f, | ||
| 450 | -1.04913055217340124191f, | ||
| 451 | 0.204446009836232697516f); | ||
| 452 | #else | ||
| 453 | #error | ||
| 454 | #endif | ||
| 455 | |||
| 456 | logarithm = | ||
| 457 | 196602 | _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
| 458 | 65534 | logarithm = _mm_mul_ps(logarithm, ln2); | |
| 459 | |||
| 460 | |||
| 461 | // Now calculate b*lna | ||
| 462 | 65534 | bVal = _mm_load_ps(bPtr); | |
| 463 | 65534 | bVal = _mm_mul_ps(bVal, logarithm); | |
| 464 | |||
| 465 | // Now compute exp(b*lna) | ||
| 466 | 131068 | bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); | |
| 467 | |||
| 468 | 131068 | fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); | |
| 469 | |||
| 470 | 65534 | emm0 = _mm_cvttps_epi32(fx); | |
| 471 | 65534 | tmp = _mm_cvtepi32_ps(emm0); | |
| 472 | |||
| 473 | 131068 | mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); | |
| 474 | 65534 | fx = _mm_sub_ps(tmp, mask); | |
| 475 | |||
| 476 | 65534 | tmp = _mm_mul_ps(fx, exp_C1); | |
| 477 | 65534 | z = _mm_mul_ps(fx, exp_C2); | |
| 478 | 131068 | bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); | |
| 479 | 65534 | z = _mm_mul_ps(bVal, bVal); | |
| 480 | |||
| 481 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); | |
| 482 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); | |
| 483 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); | |
| 484 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); | |
| 485 | 65534 | y = _mm_add_ps(y, one); | |
| 486 | |||
| 487 | 196602 | emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 488 | |||
| 489 | 65534 | pow2n = _mm_castsi128_ps(emm0); | |
| 490 | 65534 | cVal = _mm_mul_ps(y, pow2n); | |
| 491 | |||
| 492 | _mm_store_ps(cPtr, cVal); | ||
| 493 | |||
| 494 | 65534 | aPtr += 4; | |
| 495 | 65534 | bPtr += 4; | |
| 496 | 65534 | cPtr += 4; | |
| 497 | } | ||
| 498 | |||
| 499 | 2 | number = quarterPoints * 4; | |
| 500 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
| 501 | 6 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
| 502 | } | ||
| 503 | 2 | } | |
| 504 | |||
| 505 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
| 506 | |||
| 507 | #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */ | ||
| 508 | |||
| 509 | #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H | ||
| 510 | #define INCLUDED_volk_32f_x2_pow_32f_u_H | ||
| 511 | |||
| 512 | #include <inttypes.h> | ||
| 513 | #include <math.h> | ||
| 514 | #include <stdio.h> | ||
| 515 | #include <stdlib.h> | ||
| 516 | |||
| 517 | #define POW_POLY_DEGREE 3 | ||
| 518 | |||
| 519 | #ifdef LV_HAVE_GENERIC | ||
| 520 | |||
| 521 | 2 | static inline void volk_32f_x2_pow_32f_generic(float* cVector, | |
| 522 | const float* bVector, | ||
| 523 | const float* aVector, | ||
| 524 | unsigned int num_points) | ||
| 525 | { | ||
| 526 | 2 | float* cPtr = cVector; | |
| 527 | 2 | const float* bPtr = bVector; | |
| 528 | 2 | const float* aPtr = aVector; | |
| 529 | 2 | unsigned int number = 0; | |
| 530 | |||
| 531 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
| 532 | 262142 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
| 533 | } | ||
| 534 | 2 | } | |
| 535 | #endif /* LV_HAVE_GENERIC */ | ||
| 536 | |||
| 537 | |||
| 538 | #ifdef LV_HAVE_SSE4_1 | ||
| 539 | #include <smmintrin.h> | ||
| 540 | |||
| 541 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
| 542 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
| 543 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
| 544 | #define POLY3(x, c0, c1, c2, c3) \ | ||
| 545 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
| 546 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
| 547 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
| 548 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
| 549 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
| 550 | |||
| 551 | 2 | static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, | |
| 552 | const float* bVector, | ||
| 553 | const float* aVector, | ||
| 554 | unsigned int num_points) | ||
| 555 | { | ||
| 556 | 2 | float* cPtr = cVector; | |
| 557 | 2 | const float* bPtr = bVector; | |
| 558 | 2 | const float* aPtr = aVector; | |
| 559 | |||
| 560 | 2 | unsigned int number = 0; | |
| 561 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 562 | |||
| 563 | __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 564 | __m128 tmp, fx, mask, pow2n, z, y; | ||
| 565 | __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 566 | __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 567 | __m128i bias, exp, emm0, pi32_0x7f; | ||
| 568 | |||
| 569 | 2 | one = _mm_set1_ps(1.0); | |
| 570 | 2 | exp_hi = _mm_set1_ps(88.3762626647949); | |
| 571 | 2 | exp_lo = _mm_set1_ps(-88.3762626647949); | |
| 572 | 2 | ln2 = _mm_set1_ps(0.6931471805); | |
| 573 | 2 | log2EF = _mm_set1_ps(1.44269504088896341); | |
| 574 | 2 | half = _mm_set1_ps(0.5); | |
| 575 | 2 | exp_C1 = _mm_set1_ps(0.693359375); | |
| 576 | 2 | exp_C2 = _mm_set1_ps(-2.12194440e-4); | |
| 577 | 2 | pi32_0x7f = _mm_set1_epi32(0x7f); | |
| 578 | |||
| 579 | 2 | exp_p0 = _mm_set1_ps(1.9875691500e-4); | |
| 580 | 2 | exp_p1 = _mm_set1_ps(1.3981999507e-3); | |
| 581 | 2 | exp_p2 = _mm_set1_ps(8.3334519073e-3); | |
| 582 | 2 | exp_p3 = _mm_set1_ps(4.1665795894e-2); | |
| 583 | 2 | exp_p4 = _mm_set1_ps(1.6666665459e-1); | |
| 584 | 2 | exp_p5 = _mm_set1_ps(5.0000001201e-1); | |
| 585 | |||
| 586 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
| 587 | // First compute the logarithm | ||
| 588 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
| 589 | 65534 | bias = _mm_set1_epi32(127); | |
| 590 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
| 591 | 327670 | exp = _mm_sub_epi32( | |
| 592 | _mm_srli_epi32( | ||
| 593 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
| 594 | bias); | ||
| 595 | 65534 | logarithm = _mm_cvtepi32_ps(exp); | |
| 596 | |||
| 597 | 262136 | frac = _mm_or_ps(leadingOne, | |
| 598 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
| 599 | |||
| 600 | #if POW_POLY_DEGREE == 6 | ||
| 601 | mantissa = POLY5(frac, | ||
| 602 | 3.1157899f, | ||
| 603 | -3.3241990f, | ||
| 604 | 2.5988452f, | ||
| 605 | -1.2315303f, | ||
| 606 | 3.1821337e-1f, | ||
| 607 | -3.4436006e-2f); | ||
| 608 | #elif POW_POLY_DEGREE == 5 | ||
| 609 | mantissa = POLY4(frac, | ||
| 610 | 2.8882704548164776201f, | ||
| 611 | -2.52074962577807006663f, | ||
| 612 | 1.48116647521213171641f, | ||
| 613 | -0.465725644288844778798f, | ||
| 614 | 0.0596515482674574969533f); | ||
| 615 | #elif POW_POLY_DEGREE == 4 | ||
| 616 | mantissa = POLY3(frac, | ||
| 617 | 2.61761038894603480148f, | ||
| 618 | -1.75647175389045657003f, | ||
| 619 | 0.688243882994381274313f, | ||
| 620 | -0.107254423828329604454f); | ||
| 621 | #elif POW_POLY_DEGREE == 3 | ||
| 622 | 458738 | mantissa = POLY2(frac, | |
| 623 | 2.28330284476918490682f, | ||
| 624 | -1.04913055217340124191f, | ||
| 625 | 0.204446009836232697516f); | ||
| 626 | #else | ||
| 627 | #error | ||
| 628 | #endif | ||
| 629 | |||
| 630 | logarithm = | ||
| 631 | 196602 | _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
| 632 | 65534 | logarithm = _mm_mul_ps(logarithm, ln2); | |
| 633 | |||
| 634 | |||
| 635 | // Now calculate b*lna | ||
| 636 | 65534 | bVal = _mm_loadu_ps(bPtr); | |
| 637 | 65534 | bVal = _mm_mul_ps(bVal, logarithm); | |
| 638 | |||
| 639 | // Now compute exp(b*lna) | ||
| 640 | 131068 | bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); | |
| 641 | |||
| 642 | 131068 | fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); | |
| 643 | |||
| 644 | 65534 | emm0 = _mm_cvttps_epi32(fx); | |
| 645 | 65534 | tmp = _mm_cvtepi32_ps(emm0); | |
| 646 | |||
| 647 | 131068 | mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); | |
| 648 | 65534 | fx = _mm_sub_ps(tmp, mask); | |
| 649 | |||
| 650 | 65534 | tmp = _mm_mul_ps(fx, exp_C1); | |
| 651 | 65534 | z = _mm_mul_ps(fx, exp_C2); | |
| 652 | 131068 | bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); | |
| 653 | 65534 | z = _mm_mul_ps(bVal, bVal); | |
| 654 | |||
| 655 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); | |
| 656 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); | |
| 657 | 196602 | y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); | |
| 658 | 196602 | y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); | |
| 659 | 65534 | y = _mm_add_ps(y, one); | |
| 660 | |||
| 661 | 196602 | emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 662 | |||
| 663 | 65534 | pow2n = _mm_castsi128_ps(emm0); | |
| 664 | 65534 | cVal = _mm_mul_ps(y, pow2n); | |
| 665 | |||
| 666 | _mm_storeu_ps(cPtr, cVal); | ||
| 667 | |||
| 668 | 65534 | aPtr += 4; | |
| 669 | 65534 | bPtr += 4; | |
| 670 | 65534 | cPtr += 4; | |
| 671 | } | ||
| 672 | |||
| 673 | 2 | number = quarterPoints * 4; | |
| 674 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
| 675 | 6 | *cPtr++ = powf(*aPtr++, *bPtr++); | |
| 676 | } | ||
| 677 | 2 | } | |
| 678 | |||
| 679 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
| 680 | |||
| 681 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 682 | #include <immintrin.h> | ||
| 683 | |||
| 684 | #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) | ||
| 685 | #define POLY1_AVX2_FMA(x, c0, c1) \ | ||
| 686 | _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) | ||
| 687 | #define POLY2_AVX2_FMA(x, c0, c1, c2) \ | ||
| 688 | _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
| 689 | #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ | ||
| 690 | _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
| 691 | #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ | ||
| 692 | _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
| 693 | #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ | ||
| 694 | _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
| 695 | |||
| 696 | 2 | static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, | |
| 697 | const float* bVector, | ||
| 698 | const float* aVector, | ||
| 699 | unsigned int num_points) | ||
| 700 | { | ||
| 701 | 2 | float* cPtr = cVector; | |
| 702 | 2 | const float* bPtr = bVector; | |
| 703 | 2 | const float* aPtr = aVector; | |
| 704 | |||
| 705 | 2 | unsigned int number = 0; | |
| 706 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 707 | |||
| 708 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 709 | __m256 tmp, fx, mask, pow2n, z, y; | ||
| 710 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 711 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 712 | __m256i bias, exp, emm0, pi32_0x7f; | ||
| 713 | |||
| 714 | 2 | one = _mm256_set1_ps(1.0); | |
| 715 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
| 716 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
| 717 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
| 718 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
| 719 | 2 | half = _mm256_set1_ps(0.5); | |
| 720 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
| 721 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
| 722 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
| 723 | |||
| 724 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
| 725 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
| 726 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
| 727 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
| 728 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
| 729 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
| 730 | |||
| 731 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 732 | // First compute the logarithm | ||
| 733 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
| 734 | 32766 | bias = _mm256_set1_epi32(127); | |
| 735 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
| 736 | 163830 | exp = _mm256_sub_epi32( | |
| 737 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
| 738 | _mm256_set1_epi32(0x7f800000)), | ||
| 739 | 23), | ||
| 740 | bias); | ||
| 741 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
| 742 | |||
| 743 | 131064 | frac = _mm256_or_ps( | |
| 744 | leadingOne, | ||
| 745 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
| 746 | |||
| 747 | #if POW_POLY_DEGREE == 6 | ||
| 748 | mantissa = POLY5_AVX2_FMA(frac, | ||
| 749 | 3.1157899f, | ||
| 750 | -3.3241990f, | ||
| 751 | 2.5988452f, | ||
| 752 | -1.2315303f, | ||
| 753 | 3.1821337e-1f, | ||
| 754 | -3.4436006e-2f); | ||
| 755 | #elif POW_POLY_DEGREE == 5 | ||
| 756 | mantissa = POLY4_AVX2_FMA(frac, | ||
| 757 | 2.8882704548164776201f, | ||
| 758 | -2.52074962577807006663f, | ||
| 759 | 1.48116647521213171641f, | ||
| 760 | -0.465725644288844778798f, | ||
| 761 | 0.0596515482674574969533f); | ||
| 762 | #elif POW_POLY_DEGREE == 4 | ||
| 763 | mantissa = POLY3_AVX2_FMA(frac, | ||
| 764 | 2.61761038894603480148f, | ||
| 765 | -1.75647175389045657003f, | ||
| 766 | 0.688243882994381274313f, | ||
| 767 | -0.107254423828329604454f); | ||
| 768 | #elif POW_POLY_DEGREE == 3 | ||
| 769 | 163830 | mantissa = POLY2_AVX2_FMA(frac, | |
| 770 | 2.28330284476918490682f, | ||
| 771 | -1.04913055217340124191f, | ||
| 772 | 0.204446009836232697516f); | ||
| 773 | #else | ||
| 774 | #error | ||
| 775 | #endif | ||
| 776 | |||
| 777 | 65532 | logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); | |
| 778 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
| 779 | |||
| 780 | |||
| 781 | // Now calculate b*lna | ||
| 782 | 32766 | bVal = _mm256_loadu_ps(bPtr); | |
| 783 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
| 784 | |||
| 785 | // Now compute exp(b*lna) | ||
| 786 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
| 787 | |||
| 788 | 32766 | fx = _mm256_fmadd_ps(bVal, log2EF, half); | |
| 789 | |||
| 790 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
| 791 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
| 792 | |||
| 793 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
| 794 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
| 795 | |||
| 796 | 32766 | tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); | |
| 797 | 32766 | bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); | |
| 798 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
| 799 | |||
| 800 | 32766 | y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); | |
| 801 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p2); | |
| 802 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p3); | |
| 803 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p4); | |
| 804 | 32766 | y = _mm256_fmadd_ps(y, bVal, exp_p5); | |
| 805 | 32766 | y = _mm256_fmadd_ps(y, z, bVal); | |
| 806 | 32766 | y = _mm256_add_ps(y, one); | |
| 807 | |||
| 808 | emm0 = | ||
| 809 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 810 | |||
| 811 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
| 812 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
| 813 | |||
| 814 | _mm256_storeu_ps(cPtr, cVal); | ||
| 815 | |||
| 816 | 32766 | aPtr += 8; | |
| 817 | 32766 | bPtr += 8; | |
| 818 | 32766 | cPtr += 8; | |
| 819 | } | ||
| 820 | |||
| 821 | 2 | number = eighthPoints * 8; | |
| 822 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 823 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
| 824 | } | ||
| 825 | 2 | } | |
| 826 | |||
| 827 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
| 828 | |||
| 829 | #ifdef LV_HAVE_AVX2 | ||
| 830 | #include <immintrin.h> | ||
| 831 | |||
| 832 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
| 833 | #define POLY1_AVX2(x, c0, c1) \ | ||
| 834 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
| 835 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
| 836 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
| 837 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
| 838 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
| 839 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
| 840 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
| 841 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
| 842 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
| 843 | |||
| 844 | 2 | static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector, | |
| 845 | const float* bVector, | ||
| 846 | const float* aVector, | ||
| 847 | unsigned int num_points) | ||
| 848 | { | ||
| 849 | 2 | float* cPtr = cVector; | |
| 850 | 2 | const float* bPtr = bVector; | |
| 851 | 2 | const float* aPtr = aVector; | |
| 852 | |||
| 853 | 2 | unsigned int number = 0; | |
| 854 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 855 | |||
| 856 | __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; | ||
| 857 | __m256 tmp, fx, mask, pow2n, z, y; | ||
| 858 | __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; | ||
| 859 | __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; | ||
| 860 | __m256i bias, exp, emm0, pi32_0x7f; | ||
| 861 | |||
| 862 | 2 | one = _mm256_set1_ps(1.0); | |
| 863 | 2 | exp_hi = _mm256_set1_ps(88.3762626647949); | |
| 864 | 2 | exp_lo = _mm256_set1_ps(-88.3762626647949); | |
| 865 | 2 | ln2 = _mm256_set1_ps(0.6931471805); | |
| 866 | 2 | log2EF = _mm256_set1_ps(1.44269504088896341); | |
| 867 | 2 | half = _mm256_set1_ps(0.5); | |
| 868 | 2 | exp_C1 = _mm256_set1_ps(0.693359375); | |
| 869 | 2 | exp_C2 = _mm256_set1_ps(-2.12194440e-4); | |
| 870 | 2 | pi32_0x7f = _mm256_set1_epi32(0x7f); | |
| 871 | |||
| 872 | 2 | exp_p0 = _mm256_set1_ps(1.9875691500e-4); | |
| 873 | 2 | exp_p1 = _mm256_set1_ps(1.3981999507e-3); | |
| 874 | 2 | exp_p2 = _mm256_set1_ps(8.3334519073e-3); | |
| 875 | 2 | exp_p3 = _mm256_set1_ps(4.1665795894e-2); | |
| 876 | 2 | exp_p4 = _mm256_set1_ps(1.6666665459e-1); | |
| 877 | 2 | exp_p5 = _mm256_set1_ps(5.0000001201e-1); | |
| 878 | |||
| 879 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 880 | // First compute the logarithm | ||
| 881 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
| 882 | 32766 | bias = _mm256_set1_epi32(127); | |
| 883 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
| 884 | 163830 | exp = _mm256_sub_epi32( | |
| 885 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
| 886 | _mm256_set1_epi32(0x7f800000)), | ||
| 887 | 23), | ||
| 888 | bias); | ||
| 889 | 32766 | logarithm = _mm256_cvtepi32_ps(exp); | |
| 890 | |||
| 891 | 131064 | frac = _mm256_or_ps( | |
| 892 | leadingOne, | ||
| 893 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
| 894 | |||
| 895 | #if POW_POLY_DEGREE == 6 | ||
| 896 | mantissa = POLY5_AVX2(frac, | ||
| 897 | 3.1157899f, | ||
| 898 | -3.3241990f, | ||
| 899 | 2.5988452f, | ||
| 900 | -1.2315303f, | ||
| 901 | 3.1821337e-1f, | ||
| 902 | -3.4436006e-2f); | ||
| 903 | #elif POW_POLY_DEGREE == 5 | ||
| 904 | mantissa = POLY4_AVX2(frac, | ||
| 905 | 2.8882704548164776201f, | ||
| 906 | -2.52074962577807006663f, | ||
| 907 | 1.48116647521213171641f, | ||
| 908 | -0.465725644288844778798f, | ||
| 909 | 0.0596515482674574969533f); | ||
| 910 | #elif POW_POLY_DEGREE == 4 | ||
| 911 | mantissa = POLY3_AVX2(frac, | ||
| 912 | 2.61761038894603480148f, | ||
| 913 | -1.75647175389045657003f, | ||
| 914 | 0.688243882994381274313f, | ||
| 915 | -0.107254423828329604454f); | ||
| 916 | #elif POW_POLY_DEGREE == 3 | ||
| 917 | 229362 | mantissa = POLY2_AVX2(frac, | |
| 918 | 2.28330284476918490682f, | ||
| 919 | -1.04913055217340124191f, | ||
| 920 | 0.204446009836232697516f); | ||
| 921 | #else | ||
| 922 | #error | ||
| 923 | #endif | ||
| 924 | |||
| 925 | 98298 | logarithm = _mm256_add_ps( | |
| 926 | _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); | ||
| 927 | 32766 | logarithm = _mm256_mul_ps(logarithm, ln2); | |
| 928 | |||
| 929 | // Now calculate b*lna | ||
| 930 | 32766 | bVal = _mm256_loadu_ps(bPtr); | |
| 931 | 32766 | bVal = _mm256_mul_ps(bVal, logarithm); | |
| 932 | |||
| 933 | // Now compute exp(b*lna) | ||
| 934 | 65532 | bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); | |
| 935 | |||
| 936 | 65532 | fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); | |
| 937 | |||
| 938 | 32766 | emm0 = _mm256_cvttps_epi32(fx); | |
| 939 | 32766 | tmp = _mm256_cvtepi32_ps(emm0); | |
| 940 | |||
| 941 | 65532 | mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); | |
| 942 | 32766 | fx = _mm256_sub_ps(tmp, mask); | |
| 943 | |||
| 944 | 65532 | tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); | |
| 945 | 65532 | bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); | |
| 946 | 32766 | z = _mm256_mul_ps(bVal, bVal); | |
| 947 | |||
| 948 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); | |
| 949 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); | |
| 950 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); | |
| 951 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); | |
| 952 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); | |
| 953 | 65532 | y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); | |
| 954 | 32766 | y = _mm256_add_ps(y, one); | |
| 955 | |||
| 956 | emm0 = | ||
| 957 | 98298 | _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); | |
| 958 | |||
| 959 | 32766 | pow2n = _mm256_castsi256_ps(emm0); | |
| 960 | 32766 | cVal = _mm256_mul_ps(y, pow2n); | |
| 961 | |||
| 962 | _mm256_storeu_ps(cPtr, cVal); | ||
| 963 | |||
| 964 | 32766 | aPtr += 8; | |
| 965 | 32766 | bPtr += 8; | |
| 966 | 32766 | cPtr += 8; | |
| 967 | } | ||
| 968 | |||
| 969 | 2 | number = eighthPoints * 8; | |
| 970 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 971 | 14 | *cPtr++ = pow(*aPtr++, *bPtr++); | |
| 972 | } | ||
| 973 | 2 | } | |
| 974 | |||
| 975 | #endif /* LV_HAVE_AVX2 for unaligned */ | ||
| 976 | |||
| 977 | #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */ | ||
| 978 |