GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_log2_32f.h
Date: 2023-10-23 23:10:04
            Exec   Total  Coverage
Lines:       129     129    100.0%
Functions:     8       8    100.0%
Branches:     17      18     94.4%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_log2_32f
12 *
13 * \b Overview
14 *
15 * Computes base 2 log of input vector and stores results in output vector.
16 *
17 * Note that this implementation is not conforming to the IEEE FP standard, i.e.,
18 * +-Inf outputs are mapped to +-127.0f and +-NaN input values are not supported.
19 *
20 * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation
21 * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm
22 *
23 * Permission is hereby granted, free of charge, to any person obtaining a
24 * copy of this software and associated documentation files (the
25 * "Software"), to deal in the Software without restriction, including
26 * without limitation the rights to use, copy, modify, merge, publish,
27 * distribute, sub license, and/or sell copies of the Software, and to
28 * permit persons to whom the Software is furnished to do so, subject to
29 * the following conditions:
30 *
31 * The above copyright notice and this permission notice (including the
32 * next paragraph) shall be included in all copies or substantial portions
33 * of the Software.
34 *
35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
36 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
38 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
39 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
40 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
41 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
42 *
43 * This is the MIT License (MIT)
44 *
45 * <b>Dispatcher Prototype</b>
46 * \code
47 * void volk_32f_log2_32f(float* bVector, const float* aVector, unsigned int num_points)
48 * \endcode
49 *
50 * \b Inputs
51 * \li aVector: the input vector of floats.
52 * \li num_points: The number of data points.
53 *
54 * \b Outputs
55 * \li bVector: The output vector.
56 *
57 * \b Example
58 * \code
59 * int N = 10;
60 * unsigned int alignment = volk_get_alignment();
61 * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
62 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
63 *
64 * for(unsigned int ii = 0; ii < N; ++ii){
65 * in[ii] = std::pow(2.f,((float)ii));
66 * }
67 *
68 * volk_32f_log2_32f(out, in, N);
69 *
70 * for(unsigned int ii = 0; ii < N; ++ii){
71 * printf("out(%i) = %f\n", ii, out[ii]);
72 * }
73 *
74 * volk_free(in);
75 * volk_free(out);
76 * \endcode
77 */
78
79 #ifndef INCLUDED_volk_32f_log2_32f_a_H
80 #define INCLUDED_volk_32f_log2_32f_a_H
81
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86
87 #define LOG_POLY_DEGREE 6
88
89 #ifdef LV_HAVE_GENERIC
90
91 static inline void
92 10 volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
93 {
94 10 float* bPtr = bVector;
95 10 const float* aPtr = aVector;
96 10 unsigned int number = 0;
97
2/2
✓ Branch 0 taken 262190 times.
✓ Branch 1 taken 10 times.
98 262200 for (number = 0; number < num_points; number++)
99 262190 *bPtr++ = log2f_non_ieee(*aPtr++);
100 10 }
101 #endif /* LV_HAVE_GENERIC */
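
The generic path above relies on log2f_non_ieee(), which is declared outside this header (in volk_common.h in current VOLK trees). As a hedged sketch only, its behavior should match the clamping that volk_32f_log2_32f_u_generic further below spells out explicitly; log2f_non_ieee_sketch() here is a hypothetical stand-in, not the VOLK implementation:

/* Hypothetical scalar sketch of the non-IEEE behavior described in the
 * overview: log2f(0.0f) yields -Inf, which the kernel clamps to -127.0f. */
#include <math.h>

static inline float log2f_non_ieee_sketch(float x)
{
    const float result = log2f(x);
    return isinf(result) ? -127.0f : result; /* mirrors the u_generic path */
}
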
102
103 #if LV_HAVE_AVX2 && LV_HAVE_FMA
104 #include <immintrin.h>
105
106 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
107 #define POLY1_FMAAVX2(x, c0, c1) \
108 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
109 #define POLY2_FMAAVX2(x, c0, c1, c2) \
110 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
111 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
112 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
113 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
114 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
115 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
116 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
117
118 4 static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
119 const float* aVector,
120 unsigned int num_points)
121 {
122 4 float* bPtr = bVector;
123 4 const float* aPtr = aVector;
124
125 4 unsigned int number = 0;
126 4 const unsigned int eighthPoints = num_points / 8;
127
128 __m256 aVal, bVal, mantissa, frac, leadingOne;
129 __m256i bias, exp;
130
2/2
✓ Branch 0 taken 65532 times.
✓ Branch 1 taken 4 times.
131 65536 for (; number < eighthPoints; number++) {
132
133 65532 aVal = _mm256_load_ps(aPtr);
134 65532 bias = _mm256_set1_epi32(127);
135 65532 leadingOne = _mm256_set1_ps(1.0f);
136 327660 exp = _mm256_sub_epi32(
137 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
138 _mm256_set1_epi32(0x7f800000)),
139 23),
140 bias);
141 65532 bVal = _mm256_cvtepi32_ps(exp);
142
143 // Now to extract mantissa
144 262128 frac = _mm256_or_ps(
145 leadingOne,
146 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
147
148 #if LOG_POLY_DEGREE == 6
149 720852 mantissa = POLY5_FMAAVX2(frac,
150 3.1157899f,
151 -3.3241990f,
152 2.5988452f,
153 -1.2315303f,
154 3.1821337e-1f,
155 -3.4436006e-2f);
156 #elif LOG_POLY_DEGREE == 5
157 mantissa = POLY4_FMAAVX2(frac,
158 2.8882704548164776201f,
159 -2.52074962577807006663f,
160 1.48116647521213171641f,
161 -0.465725644288844778798f,
162 0.0596515482674574969533f);
163 #elif LOG_POLY_DEGREE == 4
164 mantissa = POLY3_FMAAVX2(frac,
165 2.61761038894603480148f,
166 -1.75647175389045657003f,
167 0.688243882994381274313f,
168 -0.107254423828329604454f);
169 #elif LOG_POLY_DEGREE == 3
170 mantissa = POLY2_FMAAVX2(frac,
171 2.28330284476918490682f,
172 -1.04913055217340124191f,
173 0.204446009836232697516f);
174 #else
175 #error
176 #endif
177
178 131064 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
179 _mm256_store_ps(bPtr, bVal);
180
181 65532 aPtr += 8;
182 65532 bPtr += 8;
183 }
184
185 4 number = eighthPoints * 8;
186 4 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
187 4 }
188
189 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
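
The POLY*_FMAAVX2 macros above build a Horner-scheme evaluation: POLY5_FMAAVX2(x, c0, ..., c5) expands to ((((c5*x + c4)*x + c3)*x + c2)*x + c1)*x + c0, with every step fused into one _mm256_fmadd_ps. The kernel then reconstructs log2(v) = exponent + P(m) * (m - 1) for the mantissa m in [1, 2), so P approximates log2(m)/(m - 1). A scalar sketch of the same recurrence (poly5_horner() is an illustrative helper, not part of VOLK; coefficients copied from the LOG_POLY_DEGREE == 6 branch):

/* Illustrative Horner evaluation equivalent to the POLY5_FMAAVX2 expansion. */
static inline float poly5_horner(float x)
{
    const float c0 = 3.1157899f, c1 = -3.3241990f, c2 = 2.5988452f,
                c3 = -1.2315303f, c4 = 3.1821337e-1f, c5 = -3.4436006e-2f;
    float p = c5;       /* POLY0 */
    p = p * x + c4;     /* POLY1 */
    p = p * x + c3;     /* POLY2 */
    p = p * x + c2;     /* POLY3 */
    p = p * x + c1;     /* POLY4 */
    p = p * x + c0;     /* POLY5: approximates log2(x)/(x - 1) on [1, 2) */
    return p;
}
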
190
191 #ifdef LV_HAVE_AVX2
192 #include <immintrin.h>
193
194 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
195 #define POLY1_AVX2(x, c0, c1) \
196 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
197 #define POLY2_AVX2(x, c0, c1, c2) \
198 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
199 #define POLY3_AVX2(x, c0, c1, c2, c3) \
200 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
201 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
202 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
203 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
204 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
205
206 static inline void
207 2 volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
208 {
209 2 float* bPtr = bVector;
210 2 const float* aPtr = aVector;
211
212 2 unsigned int number = 0;
213 2 const unsigned int eighthPoints = num_points / 8;
214
215 __m256 aVal, bVal, mantissa, frac, leadingOne;
216 __m256i bias, exp;
217
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
218 32768 for (; number < eighthPoints; number++) {
219
220 32766 aVal = _mm256_load_ps(aPtr);
221 32766 bias = _mm256_set1_epi32(127);
222 32766 leadingOne = _mm256_set1_ps(1.0f);
223 163830 exp = _mm256_sub_epi32(
224 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
225 _mm256_set1_epi32(0x7f800000)),
226 23),
227 bias);
228 32766 bVal = _mm256_cvtepi32_ps(exp);
229
230 // Now to extract mantissa
231 131064 frac = _mm256_or_ps(
232 leadingOne,
233 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
234
235 #if LOG_POLY_DEGREE == 6
236 524256 mantissa = POLY5_AVX2(frac,
237 3.1157899f,
238 -3.3241990f,
239 2.5988452f,
240 -1.2315303f,
241 3.1821337e-1f,
242 -3.4436006e-2f);
243 #elif LOG_POLY_DEGREE == 5
244 mantissa = POLY4_AVX2(frac,
245 2.8882704548164776201f,
246 -2.52074962577807006663f,
247 1.48116647521213171641f,
248 -0.465725644288844778798f,
249 0.0596515482674574969533f);
250 #elif LOG_POLY_DEGREE == 4
251 mantissa = POLY3_AVX2(frac,
252 2.61761038894603480148f,
253 -1.75647175389045657003f,
254 0.688243882994381274313f,
255 -0.107254423828329604454f);
256 #elif LOG_POLY_DEGREE == 3
257 mantissa = POLY2_AVX2(frac,
258 2.28330284476918490682f,
259 -1.04913055217340124191f,
260 0.204446009836232697516f);
261 #else
262 #error
263 #endif
264
265 bVal =
266 98298 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
267 _mm256_store_ps(bPtr, bVal);
268
269 32766 aPtr += 8;
270 32766 bPtr += 8;
271 }
272
273 2 number = eighthPoints * 8;
274 2 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
275 2 }
276
277 #endif /* LV_HAVE_AVX2 for aligned */
278
279 #ifdef LV_HAVE_SSE4_1
280 #include <smmintrin.h>
281
282 #define POLY0(x, c0) _mm_set1_ps(c0)
283 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
284 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
285 #define POLY3(x, c0, c1, c2, c3) \
286 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
287 #define POLY4(x, c0, c1, c2, c3, c4) \
288 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
289 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
290 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
291
292 static inline void
293 2 volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
294 {
295 2 float* bPtr = bVector;
296 2 const float* aPtr = aVector;
297
298 2 unsigned int number = 0;
299 2 const unsigned int quarterPoints = num_points / 4;
300
301 __m128 aVal, bVal, mantissa, frac, leadingOne;
302 __m128i bias, exp;
303
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
304 65536 for (; number < quarterPoints; number++) {
305
306 65534 aVal = _mm_load_ps(aPtr);
307 65534 bias = _mm_set1_epi32(127);
308 65534 leadingOne = _mm_set1_ps(1.0f);
309 327670 exp = _mm_sub_epi32(
310 _mm_srli_epi32(
311 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
312 bias);
313 65534 bVal = _mm_cvtepi32_ps(exp);
314
315 // Now to extract mantissa
316 262136 frac = _mm_or_ps(leadingOne,
317 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
318
319 #if LOG_POLY_DEGREE == 6
320 1048544 mantissa = POLY5(frac,
321 3.1157899f,
322 -3.3241990f,
323 2.5988452f,
324 -1.2315303f,
325 3.1821337e-1f,
326 -3.4436006e-2f);
327 #elif LOG_POLY_DEGREE == 5
328 mantissa = POLY4(frac,
329 2.8882704548164776201f,
330 -2.52074962577807006663f,
331 1.48116647521213171641f,
332 -0.465725644288844778798f,
333 0.0596515482674574969533f);
334 #elif LOG_POLY_DEGREE == 4
335 mantissa = POLY3(frac,
336 2.61761038894603480148f,
337 -1.75647175389045657003f,
338 0.688243882994381274313f,
339 -0.107254423828329604454f);
340 #elif LOG_POLY_DEGREE == 3
341 mantissa = POLY2(frac,
342 2.28330284476918490682f,
343 -1.04913055217340124191f,
344 0.204446009836232697516f);
345 #else
346 #error
347 #endif
348
349 196602 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
350 _mm_store_ps(bPtr, bVal);
351
352 65534 aPtr += 4;
353 65534 bPtr += 4;
354 }
355
356 2 number = quarterPoints * 4;
357 2 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
358 2 }
359
360 #endif /* LV_HAVE_SSE4_1 for aligned */
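
Every x86 variant shares the same bit-level decomposition expressed with intrinsics above: mask the biased exponent with 0x7f800000, shift right by 23 and subtract the bias of 127, then OR the 23 fraction bits with 1.0f (0x3f800000) to obtain the mantissa in [1, 2). A hedged scalar mirror of those steps (split_float() is illustrative only, not part of the kernel):

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of the _mm_and_si128 / _mm_srli_epi32 / _mm_or_ps steps. */
static inline void split_float(float v, int* exponent, float* mantissa)
{
    uint32_t bits;
    memcpy(&bits, &v, sizeof(bits));                     /* _mm_castps_si128 */
    *exponent = (int)((bits & 0x7f800000u) >> 23) - 127; /* unbias exponent  */
    uint32_t m = (bits & 0x007fffffu) | 0x3f800000u;     /* OR in leading 1.0f */
    memcpy(mantissa, &m, sizeof(m));
}

With these two pieces, log2(v) = exponent + log2(mantissa), and log2(mantissa) is what the minimax polynomial supplies.
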
361
362 #ifdef LV_HAVE_NEON
363 #include <arm_neon.h>
364
365 /* these macros allow us to embed logs in other kernels */
366 #define VLOG2Q_NEON_PREAMBLE() \
367 int32x4_t one = vdupq_n_s32(0x000800000); \
368 /* minimax polynomial */ \
369 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
370 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
371 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
372 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
373 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
374 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
375 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
376 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
377 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
378 int32x4_t exp_bias = vdupq_n_s32(127);
379
380
381 #define VLOG2Q_NEON_F32(log2_approx, aval) \
382 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
383 int32x4_t significand_i = vandq_s32(aval, sig_mask); \
384 exponent_i = vshrq_n_s32(exponent_i, 23); \
385 \
386 /* extract the exponent and significand \
387 we can treat this as fixed point to save ~9% on the \
388 conversion + float add */ \
389 significand_i = vorrq_s32(one, significand_i); \
390 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
391 /* debias the exponent and convert to float */ \
392 exponent_i = vsubq_s32(exponent_i, exp_bias); \
393 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
394 \
395 /* put the significand through a polynomial fit of log2(x) [1,2] \
396 add the result to the exponent */ \
397 log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
398 float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
399 log2_approx = vaddq_f32(log2_approx, tmp1); \
400 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
401 tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
402 log2_approx = vaddq_f32(log2_approx, tmp1); \
403 \
404 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
405 tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
406 log2_approx = vaddq_f32(log2_approx, tmp1); \
407 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
408 tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
409 log2_approx = vaddq_f32(log2_approx, tmp1); \
410 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
411 tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
412 log2_approx = vaddq_f32(log2_approx, tmp1); \
413 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
414 tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
415 log2_approx = vaddq_f32(log2_approx, tmp1);
416
417 static inline void
418 volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
419 {
420 float* bPtr = bVector;
421 const float* aPtr = aVector;
422 unsigned int number;
423 const unsigned int quarterPoints = num_points / 4;
424
425 int32x4_t aval;
426 float32x4_t log2_approx;
427
428 VLOG2Q_NEON_PREAMBLE()
429 // lms
430 // p0 = vdupq_n_f32(-1.649132280361871);
431 // p1 = vdupq_n_f32(1.995047138579499);
432 // p2 = vdupq_n_f32(-0.336914839219728);
433
434 // keep in mind a single precision float is represented as
435 // (-1)^sign * 2^exp * 1.significand, so the log2 is
436 // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
437 for (number = 0; number < quarterPoints; ++number) {
438 // load float in to an int register without conversion
439 aval = vld1q_s32((int*)aPtr);
440
441 VLOG2Q_NEON_F32(log2_approx, aval)
442
443 vst1q_f32(bPtr, log2_approx);
444
445 aPtr += 4;
446 bPtr += 4;
447 }
448
449 number = quarterPoints * 4;
450 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
451 }
452
453 #endif /* LV_HAVE_NEON */
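
The NEON path keeps the significand in fixed point: the `one` constant sets the implicit leading 1 at bit 23, and vcvtq_n_f32_s32(significand_i, 23) divides by 2^23 during the conversion, producing the mantissa in [1, 2) without a separate scale and add (the ~9% saving noted in the macro comment). A worked example of the decomposition used by every variant, assuming the input 6.0f:

6.0f = 0x40C00000: sign 0, biased exponent 0x81 = 129, fraction 0x400000
exponent    = 129 - 127 = 2
significand = (0x800000 | 0x400000) / 2^23 = 1.5
log2(6.0)   = 2 + log2(1.5) ≈ 2 + 0.58496 = 2.58496
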
454
455
456 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
457
458 #ifndef INCLUDED_volk_32f_log2_32f_u_H
459 #define INCLUDED_volk_32f_log2_32f_u_H
460
461
462 #ifdef LV_HAVE_GENERIC
463
464 static inline void
465 8 volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
466 {
467 8 float* bPtr = bVector;
468 8 const float* aPtr = aVector;
469 8 unsigned int number = 0;
470
2/2
✓ Branch 0 taken 262176 times.
✓ Branch 1 taken 8 times.
471 262184 for (number = 0; number < num_points; number++) {
472 262176 float const result = log2f(*aPtr++);
1/2
✓ Branch 0 taken 262176 times.
✗ Branch 1 not taken.
473 262176 *bPtr++ = isinf(result) ? -127.0f : result;
474 }
475 8 }
476
477 #endif /* LV_HAVE_GENERIC */
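
The unaligned generic path makes the overview's non-IEEE clamp explicit: log2f(0.0f) returns -Inf, and the isinf() test replaces it with -127.0f. A minimal usage sketch exercising that edge case through the public dispatcher (host code written for illustration, not part of this header):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    unsigned int alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(4 * sizeof(float), alignment);
    float* out = (float*)volk_malloc(4 * sizeof(float), alignment);

    in[0] = 0.0f;    /* log2f -> -Inf, clamped to -127.0f */
    in[1] = 1.0f;    /* log2 = 0 */
    in[2] = 2.0f;    /* log2 = 1 */
    in[3] = 1024.0f; /* log2 = 10 */

    volk_32f_log2_32f(out, in, 4);

    for (unsigned int i = 0; i < 4; i++)
        printf("log2(%g) ~= %g\n", in[i], out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}
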
478
479
480 #ifdef LV_HAVE_SSE4_1
481 #include <smmintrin.h>
482
483 #define POLY0(x, c0) _mm_set1_ps(c0)
484 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
485 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
486 #define POLY3(x, c0, c1, c2, c3) \
487 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
488 #define POLY4(x, c0, c1, c2, c3, c4) \
489 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
490 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
491 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
492
493 static inline void
494 2 volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
495 {
496 2 float* bPtr = bVector;
497 2 const float* aPtr = aVector;
498
499 2 unsigned int number = 0;
500 2 const unsigned int quarterPoints = num_points / 4;
501
502 __m128 aVal, bVal, mantissa, frac, leadingOne;
503 __m128i bias, exp;
504
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
505 65536 for (; number < quarterPoints; number++) {
506
507 65534 aVal = _mm_loadu_ps(aPtr);
508 65534 bias = _mm_set1_epi32(127);
509 65534 leadingOne = _mm_set1_ps(1.0f);
510 327670 exp = _mm_sub_epi32(
511 _mm_srli_epi32(
512 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
513 bias);
514 65534 bVal = _mm_cvtepi32_ps(exp);
515
516 // Now to extract mantissa
517 262136 frac = _mm_or_ps(leadingOne,
518 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
519
520 #if LOG_POLY_DEGREE == 6
521 1048544 mantissa = POLY5(frac,
522 3.1157899f,
523 -3.3241990f,
524 2.5988452f,
525 -1.2315303f,
526 3.1821337e-1f,
527 -3.4436006e-2f);
528 #elif LOG_POLY_DEGREE == 5
529 mantissa = POLY4(frac,
530 2.8882704548164776201f,
531 -2.52074962577807006663f,
532 1.48116647521213171641f,
533 -0.465725644288844778798f,
534 0.0596515482674574969533f);
535 #elif LOG_POLY_DEGREE == 4
536 mantissa = POLY3(frac,
537 2.61761038894603480148f,
538 -1.75647175389045657003f,
539 0.688243882994381274313f,
540 -0.107254423828329604454f);
541 #elif LOG_POLY_DEGREE == 3
542 mantissa = POLY2(frac,
543 2.28330284476918490682f,
544 -1.04913055217340124191f,
545 0.204446009836232697516f);
546 #else
547 #error
548 #endif
549
550 196602 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
551 _mm_storeu_ps(bPtr, bVal);
552
553 65534 aPtr += 4;
554 65534 bPtr += 4;
555 }
556
557 2 number = quarterPoints * 4;
558 2 volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
559 2 }
560
561 #endif /* LV_HAVE_SSE4_1 for unaligned */
562
563 #if LV_HAVE_AVX2 && LV_HAVE_FMA
564 #include <immintrin.h>
565
566 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
567 #define POLY1_FMAAVX2(x, c0, c1) \
568 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
569 #define POLY2_FMAAVX2(x, c0, c1, c2) \
570 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
571 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
572 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
573 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
574 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
575 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
576 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
577
578 2 static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
579 const float* aVector,
580 unsigned int num_points)
581 {
582 2 float* bPtr = bVector;
583 2 const float* aPtr = aVector;
584
585 2 unsigned int number = 0;
586 2 const unsigned int eighthPoints = num_points / 8;
587
588 __m256 aVal, bVal, mantissa, frac, leadingOne;
589 __m256i bias, exp;
590
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
591 32768 for (; number < eighthPoints; number++) {
592
593 32766 aVal = _mm256_loadu_ps(aPtr);
594 32766 bias = _mm256_set1_epi32(127);
595 32766 leadingOne = _mm256_set1_ps(1.0f);
596 163830 exp = _mm256_sub_epi32(
597 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
598 _mm256_set1_epi32(0x7f800000)),
599 23),
600 bias);
601 32766 bVal = _mm256_cvtepi32_ps(exp);
602
603 // Now to extract mantissa
604 131064 frac = _mm256_or_ps(
605 leadingOne,
606 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
607
608 #if LOG_POLY_DEGREE == 6
609 360426 mantissa = POLY5_FMAAVX2(frac,
610 3.1157899f,
611 -3.3241990f,
612 2.5988452f,
613 -1.2315303f,
614 3.1821337e-1f,
615 -3.4436006e-2f);
616 #elif LOG_POLY_DEGREE == 5
617 mantissa = POLY4_FMAAVX2(frac,
618 2.8882704548164776201f,
619 -2.52074962577807006663f,
620 1.48116647521213171641f,
621 -0.465725644288844778798f,
622 0.0596515482674574969533f);
623 #elif LOG_POLY_DEGREE == 4
624 mantissa = POLY3_FMAAVX2(frac,
625 2.61761038894603480148f,
626 -1.75647175389045657003f,
627 0.688243882994381274313f,
628 -0.107254423828329604454f);
629 #elif LOG_POLY_DEGREE == 3
630 mantissa = POLY2_FMAAVX2(frac,
631 2.28330284476918490682f,
632 -1.04913055217340124191f,
633 0.204446009836232697516f);
634 #else
635 #error
636 #endif
637
638 65532 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
639 _mm256_storeu_ps(bPtr, bVal);
640
641 32766 aPtr += 8;
642 32766 bPtr += 8;
643 }
644
645 2 number = eighthPoints * 8;
646 2 volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
647 2 }
648
649 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
650
651 #ifdef LV_HAVE_AVX2
652 #include <immintrin.h>
653
654 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
655 #define POLY1_AVX2(x, c0, c1) \
656 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
657 #define POLY2_AVX2(x, c0, c1, c2) \
658 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
659 #define POLY3_AVX2(x, c0, c1, c2, c3) \
660 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
661 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
662 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
663 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
664 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
665
666 static inline void
667 2 volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
668 {
669 2 float* bPtr = bVector;
670 2 const float* aPtr = aVector;
671
672 2 unsigned int number = 0;
673 2 const unsigned int eighthPoints = num_points / 8;
674
675 __m256 aVal, bVal, mantissa, frac, leadingOne;
676 __m256i bias, exp;
677
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
678 32768 for (; number < eighthPoints; number++) {
679
680 32766 aVal = _mm256_loadu_ps(aPtr);
681 32766 bias = _mm256_set1_epi32(127);
682 32766 leadingOne = _mm256_set1_ps(1.0f);
683 163830 exp = _mm256_sub_epi32(
684 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
685 _mm256_set1_epi32(0x7f800000)),
686 23),
687 bias);
688 32766 bVal = _mm256_cvtepi32_ps(exp);
689
690 // Now to extract mantissa
691 131064 frac = _mm256_or_ps(
692 leadingOne,
693 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
694
695 #if LOG_POLY_DEGREE == 6
696 524256 mantissa = POLY5_AVX2(frac,
697 3.1157899f,
698 -3.3241990f,
699 2.5988452f,
700 -1.2315303f,
701 3.1821337e-1f,
702 -3.4436006e-2f);
703 #elif LOG_POLY_DEGREE == 5
704 mantissa = POLY4_AVX2(frac,
705 2.8882704548164776201f,
706 -2.52074962577807006663f,
707 1.48116647521213171641f,
708 -0.465725644288844778798f,
709 0.0596515482674574969533f);
710 #elif LOG_POLY_DEGREE == 4
711 mantissa = POLY3_AVX2(frac,
712 2.61761038894603480148f,
713 -1.75647175389045657003f,
714 0.688243882994381274313f,
715 -0.107254423828329604454f);
716 #elif LOG_POLY_DEGREE == 3
717 mantissa = POLY2_AVX2(frac,
718 2.28330284476918490682f,
719 -1.04913055217340124191f,
720 0.204446009836232697516f);
721 #else
722 #error
723 #endif
724
725 bVal =
726 98298 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
727 _mm256_storeu_ps(bPtr, bVal);
728
729 32766 aPtr += 8;
730 32766 bPtr += 8;
731 }
732
733 2 number = eighthPoints * 8;
734 2 volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
735 2 }
736
737 #endif /* LV_HAVE_AVX2 for unaligned */
738
739
740 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
741