Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_log2_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Computes base 2 log of input vector and stores results in output vector. | ||
16 | * | ||
17 | * Note that this implementation is not conforming to the IEEE FP standard, i.e., | ||
18 | * +-Inf outputs are mapped to +-127.0f and +-NaN input values are not supported. | ||
19 | * | ||
20 | * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation | ||
21 | * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm | ||
22 | * | ||
23 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
24 | * copy of this software and associated documentation files (the | ||
25 | * "Software"), to deal in the Software without restriction, including | ||
26 | * without limitation the rights to use, copy, modify, merge, publish, | ||
27 | * distribute, sub license, and/or sell copies of the Software, and to | ||
28 | * permit persons to whom the Software is furnished to do so, subject to | ||
29 | * the following conditions: | ||
30 | * | ||
31 | * The above copyright notice and this permission notice (including the | ||
32 | * next paragraph) shall be included in all copies or substantial portions | ||
33 | * of the Software. | ||
34 | * | ||
35 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||
36 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
37 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. | ||
38 | * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR | ||
39 | * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
40 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
41 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
42 | * | ||
43 | * This is the MIT License (MIT) | ||
44 | * | ||
45 | * <b>Dispatcher Prototype</b> | ||
46 | * \code | ||
47 | * void volk_32f_log2_32f(float* bVector, const float* aVector, unsigned int num_points) | ||
48 | * \endcode | ||
49 | * | ||
50 | * \b Inputs | ||
51 | * \li aVector: the input vector of floats. | ||
52 | * \li num_points: The number of data points. | ||
53 | * | ||
54 | * \b Outputs | ||
55 | * \li bVector: The output vector. | ||
56 | * | ||
57 | * \b Example | ||
58 | * \code | ||
59 | * int N = 10; | ||
60 | * unsigned int alignment = volk_get_alignment(); | ||
61 | * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
62 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
63 | * | ||
64 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
65 | * in[ii] = std::pow(2.f,((float)ii)); | ||
66 | * } | ||
67 | * | ||
68 | * volk_32f_log2_32f(out, in, N); | ||
69 | * | ||
70 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
71 | * printf("out(%i) = %f\n", ii, out[ii]); | ||
72 | * } | ||
73 | * | ||
74 | * volk_free(in); | ||
75 | * volk_free(out); | ||
76 | * \endcode | ||
77 | */ | ||
78 | |||
79 | #ifndef INCLUDED_volk_32f_log2_32f_a_H | ||
80 | #define INCLUDED_volk_32f_log2_32f_a_H | ||
81 | |||
82 | #include <inttypes.h> | ||
83 | #include <math.h> | ||
84 | #include <stdio.h> | ||
85 | #include <stdlib.h> | ||
86 | |||
87 | #define LOG_POLY_DEGREE 6 | ||
88 | |||
89 | #ifdef LV_HAVE_GENERIC | ||
90 | |||
91 | static inline void | ||
92 | 10 | volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points) | |
93 | { | ||
94 | 10 | float* bPtr = bVector; | |
95 | 10 | const float* aPtr = aVector; | |
96 | 10 | unsigned int number = 0; | |
97 | |||
98 |
2/2✓ Branch 0 taken 262190 times.
✓ Branch 1 taken 10 times.
|
262200 | for (number = 0; number < num_points; number++) |
99 | 262190 | *bPtr++ = log2f_non_ieee(*aPtr++); | |
100 | 10 | } | |
101 | #endif /* LV_HAVE_GENERIC */ | ||
102 | |||
103 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
104 | #include <immintrin.h> | ||
105 | |||
106 | #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) | ||
107 | #define POLY1_FMAAVX2(x, c0, c1) \ | ||
108 | _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) | ||
109 | #define POLY2_FMAAVX2(x, c0, c1, c2) \ | ||
110 | _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
111 | #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ | ||
112 | _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
113 | #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ | ||
114 | _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
115 | #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
116 | _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
117 | |||
118 | 4 | static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, | |
119 | const float* aVector, | ||
120 | unsigned int num_points) | ||
121 | { | ||
122 | 4 | float* bPtr = bVector; | |
123 | 4 | const float* aPtr = aVector; | |
124 | |||
125 | 4 | unsigned int number = 0; | |
126 | 4 | const unsigned int eighthPoints = num_points / 8; | |
127 | |||
128 | __m256 aVal, bVal, mantissa, frac, leadingOne; | ||
129 | __m256i bias, exp; | ||
130 | |||
131 |
2/2✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 4 times.
|
65536 | for (; number < eighthPoints; number++) { |
132 | |||
133 | 65532 | aVal = _mm256_load_ps(aPtr); | |
134 | 65532 | bias = _mm256_set1_epi32(127); | |
135 | 65532 | leadingOne = _mm256_set1_ps(1.0f); | |
136 | 327660 | exp = _mm256_sub_epi32( | |
137 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
138 | _mm256_set1_epi32(0x7f800000)), | ||
139 | 23), | ||
140 | bias); | ||
141 | 65532 | bVal = _mm256_cvtepi32_ps(exp); | |
142 | |||
143 | // Now to extract mantissa | ||
144 | 262128 | frac = _mm256_or_ps( | |
145 | leadingOne, | ||
146 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
147 | |||
148 | #if LOG_POLY_DEGREE == 6 | ||
149 | 720852 | mantissa = POLY5_FMAAVX2(frac, | |
150 | 3.1157899f, | ||
151 | -3.3241990f, | ||
152 | 2.5988452f, | ||
153 | -1.2315303f, | ||
154 | 3.1821337e-1f, | ||
155 | -3.4436006e-2f); | ||
156 | #elif LOG_POLY_DEGREE == 5 | ||
157 | mantissa = POLY4_FMAAVX2(frac, | ||
158 | 2.8882704548164776201f, | ||
159 | -2.52074962577807006663f, | ||
160 | 1.48116647521213171641f, | ||
161 | -0.465725644288844778798f, | ||
162 | 0.0596515482674574969533f); | ||
163 | #elif LOG_POLY_DEGREE == 4 | ||
164 | mantissa = POLY3_FMAAVX2(frac, | ||
165 | 2.61761038894603480148f, | ||
166 | -1.75647175389045657003f, | ||
167 | 0.688243882994381274313f, | ||
168 | -0.107254423828329604454f); | ||
169 | #elif LOG_POLY_DEGREE == 3 | ||
170 | mantissa = POLY2_FMAAVX2(frac, | ||
171 | 2.28330284476918490682f, | ||
172 | -1.04913055217340124191f, | ||
173 | 0.204446009836232697516f); | ||
174 | #else | ||
175 | #error | ||
176 | #endif | ||
177 | |||
178 | 131064 | bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); | |
179 | _mm256_store_ps(bPtr, bVal); | ||
180 | |||
181 | 65532 | aPtr += 8; | |
182 | 65532 | bPtr += 8; | |
183 | } | ||
184 | |||
185 | 4 | number = eighthPoints * 8; | |
186 | 4 | volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); | |
187 | 4 | } | |
188 | |||
189 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ | ||
190 | |||
191 | #ifdef LV_HAVE_AVX2 | ||
192 | #include <immintrin.h> | ||
193 | |||
194 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
195 | #define POLY1_AVX2(x, c0, c1) \ | ||
196 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
197 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
198 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
199 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
200 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
201 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
202 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
203 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
204 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
205 | |||
206 | static inline void | ||
207 | 2 | volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
208 | { | ||
209 | 2 | float* bPtr = bVector; | |
210 | 2 | const float* aPtr = aVector; | |
211 | |||
212 | 2 | unsigned int number = 0; | |
213 | 2 | const unsigned int eighthPoints = num_points / 8; | |
214 | |||
215 | __m256 aVal, bVal, mantissa, frac, leadingOne; | ||
216 | __m256i bias, exp; | ||
217 | |||
218 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
219 | |||
220 | 32766 | aVal = _mm256_load_ps(aPtr); | |
221 | 32766 | bias = _mm256_set1_epi32(127); | |
222 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
223 | 163830 | exp = _mm256_sub_epi32( | |
224 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
225 | _mm256_set1_epi32(0x7f800000)), | ||
226 | 23), | ||
227 | bias); | ||
228 | 32766 | bVal = _mm256_cvtepi32_ps(exp); | |
229 | |||
230 | // Now to extract mantissa | ||
231 | 131064 | frac = _mm256_or_ps( | |
232 | leadingOne, | ||
233 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
234 | |||
235 | #if LOG_POLY_DEGREE == 6 | ||
236 | 524256 | mantissa = POLY5_AVX2(frac, | |
237 | 3.1157899f, | ||
238 | -3.3241990f, | ||
239 | 2.5988452f, | ||
240 | -1.2315303f, | ||
241 | 3.1821337e-1f, | ||
242 | -3.4436006e-2f); | ||
243 | #elif LOG_POLY_DEGREE == 5 | ||
244 | mantissa = POLY4_AVX2(frac, | ||
245 | 2.8882704548164776201f, | ||
246 | -2.52074962577807006663f, | ||
247 | 1.48116647521213171641f, | ||
248 | -0.465725644288844778798f, | ||
249 | 0.0596515482674574969533f); | ||
250 | #elif LOG_POLY_DEGREE == 4 | ||
251 | mantissa = POLY3_AVX2(frac, | ||
252 | 2.61761038894603480148f, | ||
253 | -1.75647175389045657003f, | ||
254 | 0.688243882994381274313f, | ||
255 | -0.107254423828329604454f); | ||
256 | #elif LOG_POLY_DEGREE == 3 | ||
257 | mantissa = POLY2_AVX2(frac, | ||
258 | 2.28330284476918490682f, | ||
259 | -1.04913055217340124191f, | ||
260 | 0.204446009836232697516f); | ||
261 | #else | ||
262 | #error | ||
263 | #endif | ||
264 | |||
265 | bVal = | ||
266 | 98298 | _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); | |
267 | _mm256_store_ps(bPtr, bVal); | ||
268 | |||
269 | 32766 | aPtr += 8; | |
270 | 32766 | bPtr += 8; | |
271 | } | ||
272 | |||
273 | 2 | number = eighthPoints * 8; | |
274 | 2 | volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); | |
275 | 2 | } | |
276 | |||
277 | #endif /* LV_HAVE_AVX2 for aligned */ | ||
278 | |||
279 | #ifdef LV_HAVE_SSE4_1 | ||
280 | #include <smmintrin.h> | ||
281 | |||
282 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
283 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
284 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
285 | #define POLY3(x, c0, c1, c2, c3) \ | ||
286 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
287 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
288 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
289 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
290 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
291 | |||
292 | static inline void | ||
293 | 2 | volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
294 | { | ||
295 | 2 | float* bPtr = bVector; | |
296 | 2 | const float* aPtr = aVector; | |
297 | |||
298 | 2 | unsigned int number = 0; | |
299 | 2 | const unsigned int quarterPoints = num_points / 4; | |
300 | |||
301 | __m128 aVal, bVal, mantissa, frac, leadingOne; | ||
302 | __m128i bias, exp; | ||
303 | |||
304 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
305 | |||
306 | 65534 | aVal = _mm_load_ps(aPtr); | |
307 | 65534 | bias = _mm_set1_epi32(127); | |
308 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
309 | 327670 | exp = _mm_sub_epi32( | |
310 | _mm_srli_epi32( | ||
311 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
312 | bias); | ||
313 | 65534 | bVal = _mm_cvtepi32_ps(exp); | |
314 | |||
315 | // Now to extract mantissa | ||
316 | 262136 | frac = _mm_or_ps(leadingOne, | |
317 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
318 | |||
319 | #if LOG_POLY_DEGREE == 6 | ||
320 | 1048544 | mantissa = POLY5(frac, | |
321 | 3.1157899f, | ||
322 | -3.3241990f, | ||
323 | 2.5988452f, | ||
324 | -1.2315303f, | ||
325 | 3.1821337e-1f, | ||
326 | -3.4436006e-2f); | ||
327 | #elif LOG_POLY_DEGREE == 5 | ||
328 | mantissa = POLY4(frac, | ||
329 | 2.8882704548164776201f, | ||
330 | -2.52074962577807006663f, | ||
331 | 1.48116647521213171641f, | ||
332 | -0.465725644288844778798f, | ||
333 | 0.0596515482674574969533f); | ||
334 | #elif LOG_POLY_DEGREE == 4 | ||
335 | mantissa = POLY3(frac, | ||
336 | 2.61761038894603480148f, | ||
337 | -1.75647175389045657003f, | ||
338 | 0.688243882994381274313f, | ||
339 | -0.107254423828329604454f); | ||
340 | #elif LOG_POLY_DEGREE == 3 | ||
341 | mantissa = POLY2(frac, | ||
342 | 2.28330284476918490682f, | ||
343 | -1.04913055217340124191f, | ||
344 | 0.204446009836232697516f); | ||
345 | #else | ||
346 | #error | ||
347 | #endif | ||
348 | |||
349 | 196602 | bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
350 | _mm_store_ps(bPtr, bVal); | ||
351 | |||
352 | 65534 | aPtr += 4; | |
353 | 65534 | bPtr += 4; | |
354 | } | ||
355 | |||
356 | 2 | number = quarterPoints * 4; | |
357 | 2 | volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); | |
358 | 2 | } | |
359 | |||
360 | #endif /* LV_HAVE_SSE4_1 for aligned */ | ||
361 | |||
362 | #ifdef LV_HAVE_NEON | ||
363 | #include <arm_neon.h> | ||
364 | |||
365 | /* these macros allow us to embed logs in other kernels */ | ||
366 | #define VLOG2Q_NEON_PREAMBLE() \ | ||
367 | int32x4_t one = vdupq_n_s32(0x000800000); \ | ||
368 | /* minimax polynomial */ \ | ||
369 | float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ | ||
370 | float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ | ||
371 | float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ | ||
372 | float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ | ||
373 | float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ | ||
374 | float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ | ||
375 | float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ | ||
376 | int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ | ||
377 | int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ | ||
378 | int32x4_t exp_bias = vdupq_n_s32(127); | ||
379 | |||
380 | |||
381 | #define VLOG2Q_NEON_F32(log2_approx, aval) \ | ||
382 | int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ | ||
383 | int32x4_t significand_i = vandq_s32(aval, sig_mask); \ | ||
384 | exponent_i = vshrq_n_s32(exponent_i, 23); \ | ||
385 | \ | ||
386 | /* extract the exponent and significand \ | ||
387 | we can treat this as fixed point to save ~9% on the \ | ||
388 | conversion + float add */ \ | ||
389 | significand_i = vorrq_s32(one, significand_i); \ | ||
390 | float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \ | ||
391 | /* debias the exponent and convert to float */ \ | ||
392 | exponent_i = vsubq_s32(exponent_i, exp_bias); \ | ||
393 | float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ | ||
394 | \ | ||
395 | /* put the significand through a polynomial fit of log2(x) [1,2] \ | ||
396 | add the result to the exponent */ \ | ||
397 | log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ | ||
398 | float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ | ||
399 | log2_approx = vaddq_f32(log2_approx, tmp1); \ | ||
400 | float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ | ||
401 | tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ | ||
402 | log2_approx = vaddq_f32(log2_approx, tmp1); \ | ||
403 | \ | ||
404 | float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ | ||
405 | tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ | ||
406 | log2_approx = vaddq_f32(log2_approx, tmp1); \ | ||
407 | float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ | ||
408 | tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ | ||
409 | log2_approx = vaddq_f32(log2_approx, tmp1); \ | ||
410 | float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ | ||
411 | tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ | ||
412 | log2_approx = vaddq_f32(log2_approx, tmp1); \ | ||
413 | float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ | ||
414 | tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ | ||
415 | log2_approx = vaddq_f32(log2_approx, tmp1); | ||
416 | |||
417 | static inline void | ||
418 | volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points) | ||
419 | { | ||
420 | float* bPtr = bVector; | ||
421 | const float* aPtr = aVector; | ||
422 | unsigned int number; | ||
423 | const unsigned int quarterPoints = num_points / 4; | ||
424 | |||
425 | int32x4_t aval; | ||
426 | float32x4_t log2_approx; | ||
427 | |||
428 | VLOG2Q_NEON_PREAMBLE() | ||
429 | // lms | ||
430 | // p0 = vdupq_n_f32(-1.649132280361871); | ||
431 | // p1 = vdupq_n_f32(1.995047138579499); | ||
432 | // p2 = vdupq_n_f32(-0.336914839219728); | ||
433 | |||
434 | // keep in mind a single precision float is represented as | ||
435 | // (-1)^sign * 2^exp * 1.significand, so the log2 is | ||
436 | // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) | ||
437 | for (number = 0; number < quarterPoints; ++number) { | ||
438 | // load float in to an int register without conversion | ||
439 | aval = vld1q_s32((int*)aPtr); | ||
440 | |||
441 | VLOG2Q_NEON_F32(log2_approx, aval) | ||
442 | |||
443 | vst1q_f32(bPtr, log2_approx); | ||
444 | |||
445 | aPtr += 4; | ||
446 | bPtr += 4; | ||
447 | } | ||
448 | |||
449 | number = quarterPoints * 4; | ||
450 | volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); | ||
451 | } | ||
452 | |||
453 | #endif /* LV_HAVE_NEON */ | ||
454 | |||
455 | |||
456 | #endif /* INCLUDED_volk_32f_log2_32f_a_H */ | ||
457 | |||
458 | #ifndef INCLUDED_volk_32f_log2_32f_u_H | ||
459 | #define INCLUDED_volk_32f_log2_32f_u_H | ||
460 | |||
461 | |||
462 | #ifdef LV_HAVE_GENERIC | ||
463 | |||
464 | static inline void | ||
465 | 8 | volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) | |
466 | { | ||
467 | 8 | float* bPtr = bVector; | |
468 | 8 | const float* aPtr = aVector; | |
469 | 8 | unsigned int number = 0; | |
470 | |||
471 |
2/2✓ Branch 0 taken 262176 times.
✓ Branch 1 taken 8 times.
|
262184 | for (number = 0; number < num_points; number++) { |
472 | 262176 | float const result = log2f(*aPtr++); | |
473 |
1/2✓ Branch 0 taken 262176 times.
✗ Branch 1 not taken.
|
262176 | *bPtr++ = isinf(result) ? -127.0f : result; |
474 | } | ||
475 | 8 | } | |
476 | |||
477 | #endif /* LV_HAVE_GENERIC */ | ||
478 | |||
479 | |||
480 | #ifdef LV_HAVE_SSE4_1 | ||
481 | #include <smmintrin.h> | ||
482 | |||
483 | #define POLY0(x, c0) _mm_set1_ps(c0) | ||
484 | #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) | ||
485 | #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) | ||
486 | #define POLY3(x, c0, c1, c2, c3) \ | ||
487 | _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) | ||
488 | #define POLY4(x, c0, c1, c2, c3, c4) \ | ||
489 | _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) | ||
490 | #define POLY5(x, c0, c1, c2, c3, c4, c5) \ | ||
491 | _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) | ||
492 | |||
493 | static inline void | ||
494 | 2 | volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) | |
495 | { | ||
496 | 2 | float* bPtr = bVector; | |
497 | 2 | const float* aPtr = aVector; | |
498 | |||
499 | 2 | unsigned int number = 0; | |
500 | 2 | const unsigned int quarterPoints = num_points / 4; | |
501 | |||
502 | __m128 aVal, bVal, mantissa, frac, leadingOne; | ||
503 | __m128i bias, exp; | ||
504 | |||
505 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
506 | |||
507 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
508 | 65534 | bias = _mm_set1_epi32(127); | |
509 | 65534 | leadingOne = _mm_set1_ps(1.0f); | |
510 | 327670 | exp = _mm_sub_epi32( | |
511 | _mm_srli_epi32( | ||
512 | _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), | ||
513 | bias); | ||
514 | 65534 | bVal = _mm_cvtepi32_ps(exp); | |
515 | |||
516 | // Now to extract mantissa | ||
517 | 262136 | frac = _mm_or_ps(leadingOne, | |
518 | _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); | ||
519 | |||
520 | #if LOG_POLY_DEGREE == 6 | ||
521 | 1048544 | mantissa = POLY5(frac, | |
522 | 3.1157899f, | ||
523 | -3.3241990f, | ||
524 | 2.5988452f, | ||
525 | -1.2315303f, | ||
526 | 3.1821337e-1f, | ||
527 | -3.4436006e-2f); | ||
528 | #elif LOG_POLY_DEGREE == 5 | ||
529 | mantissa = POLY4(frac, | ||
530 | 2.8882704548164776201f, | ||
531 | -2.52074962577807006663f, | ||
532 | 1.48116647521213171641f, | ||
533 | -0.465725644288844778798f, | ||
534 | 0.0596515482674574969533f); | ||
535 | #elif LOG_POLY_DEGREE == 4 | ||
536 | mantissa = POLY3(frac, | ||
537 | 2.61761038894603480148f, | ||
538 | -1.75647175389045657003f, | ||
539 | 0.688243882994381274313f, | ||
540 | -0.107254423828329604454f); | ||
541 | #elif LOG_POLY_DEGREE == 3 | ||
542 | mantissa = POLY2(frac, | ||
543 | 2.28330284476918490682f, | ||
544 | -1.04913055217340124191f, | ||
545 | 0.204446009836232697516f); | ||
546 | #else | ||
547 | #error | ||
548 | #endif | ||
549 | |||
550 | 196602 | bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); | |
551 | _mm_storeu_ps(bPtr, bVal); | ||
552 | |||
553 | 65534 | aPtr += 4; | |
554 | 65534 | bPtr += 4; | |
555 | } | ||
556 | |||
557 | 2 | number = quarterPoints * 4; | |
558 | 2 | volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); | |
559 | 2 | } | |
560 | |||
561 | #endif /* LV_HAVE_SSE4_1 for unaligned */ | ||
562 | |||
563 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
564 | #include <immintrin.h> | ||
565 | |||
566 | #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) | ||
567 | #define POLY1_FMAAVX2(x, c0, c1) \ | ||
568 | _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) | ||
569 | #define POLY2_FMAAVX2(x, c0, c1, c2) \ | ||
570 | _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) | ||
571 | #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ | ||
572 | _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) | ||
573 | #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ | ||
574 | _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) | ||
575 | #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
576 | _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) | ||
577 | |||
578 | 2 | static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, | |
579 | const float* aVector, | ||
580 | unsigned int num_points) | ||
581 | { | ||
582 | 2 | float* bPtr = bVector; | |
583 | 2 | const float* aPtr = aVector; | |
584 | |||
585 | 2 | unsigned int number = 0; | |
586 | 2 | const unsigned int eighthPoints = num_points / 8; | |
587 | |||
588 | __m256 aVal, bVal, mantissa, frac, leadingOne; | ||
589 | __m256i bias, exp; | ||
590 | |||
591 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
592 | |||
593 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
594 | 32766 | bias = _mm256_set1_epi32(127); | |
595 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
596 | 163830 | exp = _mm256_sub_epi32( | |
597 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
598 | _mm256_set1_epi32(0x7f800000)), | ||
599 | 23), | ||
600 | bias); | ||
601 | 32766 | bVal = _mm256_cvtepi32_ps(exp); | |
602 | |||
603 | // Now to extract mantissa | ||
604 | 131064 | frac = _mm256_or_ps( | |
605 | leadingOne, | ||
606 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
607 | |||
608 | #if LOG_POLY_DEGREE == 6 | ||
609 | 360426 | mantissa = POLY5_FMAAVX2(frac, | |
610 | 3.1157899f, | ||
611 | -3.3241990f, | ||
612 | 2.5988452f, | ||
613 | -1.2315303f, | ||
614 | 3.1821337e-1f, | ||
615 | -3.4436006e-2f); | ||
616 | #elif LOG_POLY_DEGREE == 5 | ||
617 | mantissa = POLY4_FMAAVX2(frac, | ||
618 | 2.8882704548164776201f, | ||
619 | -2.52074962577807006663f, | ||
620 | 1.48116647521213171641f, | ||
621 | -0.465725644288844778798f, | ||
622 | 0.0596515482674574969533f); | ||
623 | #elif LOG_POLY_DEGREE == 4 | ||
624 | mantissa = POLY3_FMAAVX2(frac, | ||
625 | 2.61761038894603480148f, | ||
626 | -1.75647175389045657003f, | ||
627 | 0.688243882994381274313f, | ||
628 | -0.107254423828329604454f); | ||
629 | #elif LOG_POLY_DEGREE == 3 | ||
630 | mantissa = POLY2_FMAAVX2(frac, | ||
631 | 2.28330284476918490682f, | ||
632 | -1.04913055217340124191f, | ||
633 | 0.204446009836232697516f); | ||
634 | #else | ||
635 | #error | ||
636 | #endif | ||
637 | |||
638 | 65532 | bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); | |
639 | _mm256_storeu_ps(bPtr, bVal); | ||
640 | |||
641 | 32766 | aPtr += 8; | |
642 | 32766 | bPtr += 8; | |
643 | } | ||
644 | |||
645 | 2 | number = eighthPoints * 8; | |
646 | 2 | volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); | |
647 | 2 | } | |
648 | |||
649 | #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ | ||
650 | |||
651 | #ifdef LV_HAVE_AVX2 | ||
652 | #include <immintrin.h> | ||
653 | |||
654 | #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) | ||
655 | #define POLY1_AVX2(x, c0, c1) \ | ||
656 | _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) | ||
657 | #define POLY2_AVX2(x, c0, c1, c2) \ | ||
658 | _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) | ||
659 | #define POLY3_AVX2(x, c0, c1, c2, c3) \ | ||
660 | _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) | ||
661 | #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ | ||
662 | _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) | ||
663 | #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ | ||
664 | _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) | ||
665 | |||
666 | static inline void | ||
667 | 2 | volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) | |
668 | { | ||
669 | 2 | float* bPtr = bVector; | |
670 | 2 | const float* aPtr = aVector; | |
671 | |||
672 | 2 | unsigned int number = 0; | |
673 | 2 | const unsigned int eighthPoints = num_points / 8; | |
674 | |||
675 | __m256 aVal, bVal, mantissa, frac, leadingOne; | ||
676 | __m256i bias, exp; | ||
677 | |||
678 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
679 | |||
680 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
681 | 32766 | bias = _mm256_set1_epi32(127); | |
682 | 32766 | leadingOne = _mm256_set1_ps(1.0f); | |
683 | 163830 | exp = _mm256_sub_epi32( | |
684 | _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), | ||
685 | _mm256_set1_epi32(0x7f800000)), | ||
686 | 23), | ||
687 | bias); | ||
688 | 32766 | bVal = _mm256_cvtepi32_ps(exp); | |
689 | |||
690 | // Now to extract mantissa | ||
691 | 131064 | frac = _mm256_or_ps( | |
692 | leadingOne, | ||
693 | _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); | ||
694 | |||
695 | #if LOG_POLY_DEGREE == 6 | ||
696 | 524256 | mantissa = POLY5_AVX2(frac, | |
697 | 3.1157899f, | ||
698 | -3.3241990f, | ||
699 | 2.5988452f, | ||
700 | -1.2315303f, | ||
701 | 3.1821337e-1f, | ||
702 | -3.4436006e-2f); | ||
703 | #elif LOG_POLY_DEGREE == 5 | ||
704 | mantissa = POLY4_AVX2(frac, | ||
705 | 2.8882704548164776201f, | ||
706 | -2.52074962577807006663f, | ||
707 | 1.48116647521213171641f, | ||
708 | -0.465725644288844778798f, | ||
709 | 0.0596515482674574969533f); | ||
710 | #elif LOG_POLY_DEGREE == 4 | ||
711 | mantissa = POLY3_AVX2(frac, | ||
712 | 2.61761038894603480148f, | ||
713 | -1.75647175389045657003f, | ||
714 | 0.688243882994381274313f, | ||
715 | -0.107254423828329604454f); | ||
716 | #elif LOG_POLY_DEGREE == 3 | ||
717 | mantissa = POLY2_AVX2(frac, | ||
718 | 2.28330284476918490682f, | ||
719 | -1.04913055217340124191f, | ||
720 | 0.204446009836232697516f); | ||
721 | #else | ||
722 | #error | ||
723 | #endif | ||
724 | |||
725 | bVal = | ||
726 | 98298 | _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); | |
727 | _mm256_storeu_ps(bPtr, bVal); | ||
728 | |||
729 | 32766 | aPtr += 8; | |
730 | 32766 | bPtr += 8; | |
731 | } | ||
732 | |||
733 | 2 | number = eighthPoints * 8; | |
734 | 2 | volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); | |
735 | 2 | } | |
736 | |||
737 | #endif /* LV_HAVE_AVX2 for unaligned */ | ||
738 | |||
739 | |||
740 | #endif /* INCLUDED_volk_32f_log2_32f_u_H */ | ||
741 |