GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_pow_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 360 360 100.0%
Functions: 7 7 100.0%
Branches: 26 26 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_pow_32f
12 *
13 * \b Overview
14 *
15 * Raises the sample in aVector to the power of the number in bVector.
16 *
17 * c[i] = pow(a[i], b[i])
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
22 * unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li bVector: The input vector of exponents (power values).
26 * \li aVector: The input vector of base values.
27 * \li num_points: The number of values in both input vectors.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 * Calculate the first ten powers of two (2^x).
34 * \code
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40 *
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * increasing[ii] = (float)ii;
43 * twos[ii] = 2.f;
44 * }
45 *
46 * volk_32f_x2_pow_32f(out, increasing, twos, N);
47 *
48 * for(unsigned int ii = 0; ii < N; ++ii){
49 * printf("out[%u] = %1.2f\n", ii, out[ii]);
50 * }
51 *
52 * volk_free(increasing);
53 * volk_free(twos);
54 * volk_free(out);
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
59 #define INCLUDED_volk_32f_x2_pow_32f_a_H
60
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64 #include <stdlib.h>
65
66 #define POW_POLY_DEGREE 3
67
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70
71 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
72 #define POLY1_AVX2_FMA(x, c0, c1) \
73 _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
74 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
75 _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
76 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
77 _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
78 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
79 _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
80 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
81 _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
82
83 2 static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
84 const float* bVector,
85 const float* aVector,
86 unsigned int num_points)
87 {
88 2 float* cPtr = cVector;
89 2 const float* bPtr = bVector;
90 2 const float* aPtr = aVector;
91
92 2 unsigned int number = 0;
93 2 const unsigned int eighthPoints = num_points / 8;
94
95 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
96 __m256 tmp, fx, mask, pow2n, z, y;
97 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
98 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
99 __m256i bias, exp, emm0, pi32_0x7f;
100
101 2 one = _mm256_set1_ps(1.0);
102 2 exp_hi = _mm256_set1_ps(88.3762626647949);
103 2 exp_lo = _mm256_set1_ps(-88.3762626647949);
104 2 ln2 = _mm256_set1_ps(0.6931471805);
105 2 log2EF = _mm256_set1_ps(1.44269504088896341);
106 2 half = _mm256_set1_ps(0.5);
107 2 exp_C1 = _mm256_set1_ps(0.693359375);
108 2 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
109 2 pi32_0x7f = _mm256_set1_epi32(0x7f);
110
111 2 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
112 2 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
113 2 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
114 2 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
115 2 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
116 2 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
117
118
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
119 // First compute the logarithm
120 32766 aVal = _mm256_load_ps(aPtr);
121 32766 bias = _mm256_set1_epi32(127);
122 32766 leadingOne = _mm256_set1_ps(1.0f);
123 163830 exp = _mm256_sub_epi32(
124 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
125 _mm256_set1_epi32(0x7f800000)),
126 23),
127 bias);
128 32766 logarithm = _mm256_cvtepi32_ps(exp);
129
130 131064 frac = _mm256_or_ps(
131 leadingOne,
132 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
133
134 #if POW_POLY_DEGREE == 6
135 mantissa = POLY5_AVX2_FMA(frac,
136 3.1157899f,
137 -3.3241990f,
138 2.5988452f,
139 -1.2315303f,
140 3.1821337e-1f,
141 -3.4436006e-2f);
142 #elif POW_POLY_DEGREE == 5
143 mantissa = POLY4_AVX2_FMA(frac,
144 2.8882704548164776201f,
145 -2.52074962577807006663f,
146 1.48116647521213171641f,
147 -0.465725644288844778798f,
148 0.0596515482674574969533f);
149 #elif POW_POLY_DEGREE == 4
150 mantissa = POLY3_AVX2_FMA(frac,
151 2.61761038894603480148f,
152 -1.75647175389045657003f,
153 0.688243882994381274313f,
154 -0.107254423828329604454f);
155 #elif POW_POLY_DEGREE == 3
156 163830 mantissa = POLY2_AVX2_FMA(frac,
157 2.28330284476918490682f,
158 -1.04913055217340124191f,
159 0.204446009836232697516f);
160 #else
161 #error
162 #endif
163
164 65532 logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
165 32766 logarithm = _mm256_mul_ps(logarithm, ln2);
166
167 // Now calculate b*lna
168 32766 bVal = _mm256_load_ps(bPtr);
169 32766 bVal = _mm256_mul_ps(bVal, logarithm);
170
171 // Now compute exp(b*lna)
172 65532 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
173
174 32766 fx = _mm256_fmadd_ps(bVal, log2EF, half);
175
176 32766 emm0 = _mm256_cvttps_epi32(fx);
177 32766 tmp = _mm256_cvtepi32_ps(emm0);
178
179 65532 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
180 32766 fx = _mm256_sub_ps(tmp, mask);
181
182 32766 tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
183 32766 bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
184 32766 z = _mm256_mul_ps(bVal, bVal);
185
186 32766 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
187 32766 y = _mm256_fmadd_ps(y, bVal, exp_p2);
188 32766 y = _mm256_fmadd_ps(y, bVal, exp_p3);
189 32766 y = _mm256_fmadd_ps(y, bVal, exp_p4);
190 32766 y = _mm256_fmadd_ps(y, bVal, exp_p5);
191 32766 y = _mm256_fmadd_ps(y, z, bVal);
192 32766 y = _mm256_add_ps(y, one);
193
194 emm0 =
195 98298 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
196
197 32766 pow2n = _mm256_castsi256_ps(emm0);
198 32766 cVal = _mm256_mul_ps(y, pow2n);
199
200 _mm256_store_ps(cPtr, cVal);
201
202 32766 aPtr += 8;
203 32766 bPtr += 8;
204 32766 cPtr += 8;
205 }
206
207 2 number = eighthPoints * 8;
208
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
209 14 *cPtr++ = pow(*aPtr++, *bPtr++);
210 }
211 2 }
212
213 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
214
215 #ifdef LV_HAVE_AVX2
216 #include <immintrin.h>
217
218 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
219 #define POLY1_AVX2(x, c0, c1) \
220 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
221 #define POLY2_AVX2(x, c0, c1, c2) \
222 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
223 #define POLY3_AVX2(x, c0, c1, c2, c3) \
224 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
225 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
226 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
227 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
228 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
229
230 2 static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
231 const float* bVector,
232 const float* aVector,
233 unsigned int num_points)
234 {
235 2 float* cPtr = cVector;
236 2 const float* bPtr = bVector;
237 2 const float* aPtr = aVector;
238
239 2 unsigned int number = 0;
240 2 const unsigned int eighthPoints = num_points / 8;
241
242 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
243 __m256 tmp, fx, mask, pow2n, z, y;
244 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
245 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
246 __m256i bias, exp, emm0, pi32_0x7f;
247
248 2 one = _mm256_set1_ps(1.0);
249 2 exp_hi = _mm256_set1_ps(88.3762626647949);
250 2 exp_lo = _mm256_set1_ps(-88.3762626647949);
251 2 ln2 = _mm256_set1_ps(0.6931471805);
252 2 log2EF = _mm256_set1_ps(1.44269504088896341);
253 2 half = _mm256_set1_ps(0.5);
254 2 exp_C1 = _mm256_set1_ps(0.693359375);
255 2 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
256 2 pi32_0x7f = _mm256_set1_epi32(0x7f);
257
258 2 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
259 2 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
260 2 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
261 2 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
262 2 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
263 2 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
264
265
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
266 // First compute the logarithm
267 32766 aVal = _mm256_load_ps(aPtr);
268 32766 bias = _mm256_set1_epi32(127);
269 32766 leadingOne = _mm256_set1_ps(1.0f);
270 163830 exp = _mm256_sub_epi32(
271 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
272 _mm256_set1_epi32(0x7f800000)),
273 23),
274 bias);
275 32766 logarithm = _mm256_cvtepi32_ps(exp);
276
277 131064 frac = _mm256_or_ps(
278 leadingOne,
279 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
280
281 #if POW_POLY_DEGREE == 6
282 mantissa = POLY5_AVX2(frac,
283 3.1157899f,
284 -3.3241990f,
285 2.5988452f,
286 -1.2315303f,
287 3.1821337e-1f,
288 -3.4436006e-2f);
289 #elif POW_POLY_DEGREE == 5
290 mantissa = POLY4_AVX2(frac,
291 2.8882704548164776201f,
292 -2.52074962577807006663f,
293 1.48116647521213171641f,
294 -0.465725644288844778798f,
295 0.0596515482674574969533f);
296 #elif POW_POLY_DEGREE == 4
297 mantissa = POLY3_AVX2(frac,
298 2.61761038894603480148f,
299 -1.75647175389045657003f,
300 0.688243882994381274313f,
301 -0.107254423828329604454f);
302 #elif POW_POLY_DEGREE == 3
303 229362 mantissa = POLY2_AVX2(frac,
304 2.28330284476918490682f,
305 -1.04913055217340124191f,
306 0.204446009836232697516f);
307 #else
308 #error
309 #endif
310
311 98298 logarithm = _mm256_add_ps(
312 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
313 32766 logarithm = _mm256_mul_ps(logarithm, ln2);
314
315 // Now calculate b*lna
316 32766 bVal = _mm256_load_ps(bPtr);
317 32766 bVal = _mm256_mul_ps(bVal, logarithm);
318
319 // Now compute exp(b*lna)
320 98298 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
321
322 65532 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
323
324 32766 emm0 = _mm256_cvttps_epi32(fx);
325 32766 tmp = _mm256_cvtepi32_ps(emm0);
326
327 65532 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
328 32766 fx = _mm256_sub_ps(tmp, mask);
329
330 65532 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
331 65532 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
332 32766 z = _mm256_mul_ps(bVal, bVal);
333
334 65532 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
335 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
336 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
337 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
338 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
339 65532 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
340 32766 y = _mm256_add_ps(y, one);
341
342 emm0 =
343 98298 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
344
345 32766 pow2n = _mm256_castsi256_ps(emm0);
346 32766 cVal = _mm256_mul_ps(y, pow2n);
347
348 _mm256_store_ps(cPtr, cVal);
349
350 32766 aPtr += 8;
351 32766 bPtr += 8;
352 32766 cPtr += 8;
353 }
354
355 2 number = eighthPoints * 8;
356
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
357 14 *cPtr++ = pow(*aPtr++, *bPtr++);
358 }
359 2 }
360
361 #endif /* LV_HAVE_AVX2 for aligned */
362
363
364 #ifdef LV_HAVE_SSE4_1
365 #include <smmintrin.h>
366
367 #define POLY0(x, c0) _mm_set1_ps(c0)
368 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
369 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
370 #define POLY3(x, c0, c1, c2, c3) \
371 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
372 #define POLY4(x, c0, c1, c2, c3, c4) \
373 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
374 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
375 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
376
377 2 static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
378 const float* bVector,
379 const float* aVector,
380 unsigned int num_points)
381 {
382 2 float* cPtr = cVector;
383 2 const float* bPtr = bVector;
384 2 const float* aPtr = aVector;
385
386 2 unsigned int number = 0;
387 2 const unsigned int quarterPoints = num_points / 4;
388
389 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
390 __m128 tmp, fx, mask, pow2n, z, y;
391 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
392 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
393 __m128i bias, exp, emm0, pi32_0x7f;
394
395 2 one = _mm_set1_ps(1.0);
396 2 exp_hi = _mm_set1_ps(88.3762626647949);
397 2 exp_lo = _mm_set1_ps(-88.3762626647949);
398 2 ln2 = _mm_set1_ps(0.6931471805);
399 2 log2EF = _mm_set1_ps(1.44269504088896341);
400 2 half = _mm_set1_ps(0.5);
401 2 exp_C1 = _mm_set1_ps(0.693359375);
402 2 exp_C2 = _mm_set1_ps(-2.12194440e-4);
403 2 pi32_0x7f = _mm_set1_epi32(0x7f);
404
405 2 exp_p0 = _mm_set1_ps(1.9875691500e-4);
406 2 exp_p1 = _mm_set1_ps(1.3981999507e-3);
407 2 exp_p2 = _mm_set1_ps(8.3334519073e-3);
408 2 exp_p3 = _mm_set1_ps(4.1665795894e-2);
409 2 exp_p4 = _mm_set1_ps(1.6666665459e-1);
410 2 exp_p5 = _mm_set1_ps(5.0000001201e-1);
411
412
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
413 // First compute the logarithm
414 65534 aVal = _mm_load_ps(aPtr);
415 65534 bias = _mm_set1_epi32(127);
416 65534 leadingOne = _mm_set1_ps(1.0f);
417 327670 exp = _mm_sub_epi32(
418 _mm_srli_epi32(
419 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
420 bias);
421 65534 logarithm = _mm_cvtepi32_ps(exp);
422
423 262136 frac = _mm_or_ps(leadingOne,
424 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
425
426 #if POW_POLY_DEGREE == 6
427 mantissa = POLY5(frac,
428 3.1157899f,
429 -3.3241990f,
430 2.5988452f,
431 -1.2315303f,
432 3.1821337e-1f,
433 -3.4436006e-2f);
434 #elif POW_POLY_DEGREE == 5
435 mantissa = POLY4(frac,
436 2.8882704548164776201f,
437 -2.52074962577807006663f,
438 1.48116647521213171641f,
439 -0.465725644288844778798f,
440 0.0596515482674574969533f);
441 #elif POW_POLY_DEGREE == 4
442 mantissa = POLY3(frac,
443 2.61761038894603480148f,
444 -1.75647175389045657003f,
445 0.688243882994381274313f,
446 -0.107254423828329604454f);
447 #elif POW_POLY_DEGREE == 3
448 458738 mantissa = POLY2(frac,
449 2.28330284476918490682f,
450 -1.04913055217340124191f,
451 0.204446009836232697516f);
452 #else
453 #error
454 #endif
455
456 logarithm =
457 196602 _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
458 65534 logarithm = _mm_mul_ps(logarithm, ln2);
459
460
461 // Now calculate b*lna
462 65534 bVal = _mm_load_ps(bPtr);
463 65534 bVal = _mm_mul_ps(bVal, logarithm);
464
465 // Now compute exp(b*lna)
466 131068 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
467
468 131068 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
469
470 65534 emm0 = _mm_cvttps_epi32(fx);
471 65534 tmp = _mm_cvtepi32_ps(emm0);
472
473 131068 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
474 65534 fx = _mm_sub_ps(tmp, mask);
475
476 65534 tmp = _mm_mul_ps(fx, exp_C1);
477 65534 z = _mm_mul_ps(fx, exp_C2);
478 131068 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
479 65534 z = _mm_mul_ps(bVal, bVal);
480
481 196602 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
482 196602 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
483 196602 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
484 196602 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
485 65534 y = _mm_add_ps(y, one);
486
487 196602 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
488
489 65534 pow2n = _mm_castsi128_ps(emm0);
490 65534 cVal = _mm_mul_ps(y, pow2n);
491
492 _mm_store_ps(cPtr, cVal);
493
494 65534 aPtr += 4;
495 65534 bPtr += 4;
496 65534 cPtr += 4;
497 }
498
499 2 number = quarterPoints * 4;
500
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
501 6 *cPtr++ = powf(*aPtr++, *bPtr++);
502 }
503 2 }
504
505 #endif /* LV_HAVE_SSE4_1 for aligned */
506
507 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
508
509 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
510 #define INCLUDED_volk_32f_x2_pow_32f_u_H
511
512 #include <inttypes.h>
513 #include <math.h>
514 #include <stdio.h>
515 #include <stdlib.h>
516
517 #define POW_POLY_DEGREE 3
518
519 #ifdef LV_HAVE_GENERIC
520
521 2 static inline void volk_32f_x2_pow_32f_generic(float* cVector,
522 const float* bVector,
523 const float* aVector,
524 unsigned int num_points)
525 {
526 2 float* cPtr = cVector;
527 2 const float* bPtr = bVector;
528 2 const float* aPtr = aVector;
529 2 unsigned int number = 0;
530
531
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
532 262142 *cPtr++ = powf(*aPtr++, *bPtr++);
533 }
534 2 }
535 #endif /* LV_HAVE_GENERIC */
536
537
538 #ifdef LV_HAVE_SSE4_1
539 #include <smmintrin.h>
540
541 #define POLY0(x, c0) _mm_set1_ps(c0)
542 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
543 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
544 #define POLY3(x, c0, c1, c2, c3) \
545 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
546 #define POLY4(x, c0, c1, c2, c3, c4) \
547 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
548 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
549 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
550
551 2 static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
552 const float* bVector,
553 const float* aVector,
554 unsigned int num_points)
555 {
556 2 float* cPtr = cVector;
557 2 const float* bPtr = bVector;
558 2 const float* aPtr = aVector;
559
560 2 unsigned int number = 0;
561 2 const unsigned int quarterPoints = num_points / 4;
562
563 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
564 __m128 tmp, fx, mask, pow2n, z, y;
565 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
566 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
567 __m128i bias, exp, emm0, pi32_0x7f;
568
569 2 one = _mm_set1_ps(1.0);
570 2 exp_hi = _mm_set1_ps(88.3762626647949);
571 2 exp_lo = _mm_set1_ps(-88.3762626647949);
572 2 ln2 = _mm_set1_ps(0.6931471805);
573 2 log2EF = _mm_set1_ps(1.44269504088896341);
574 2 half = _mm_set1_ps(0.5);
575 2 exp_C1 = _mm_set1_ps(0.693359375);
576 2 exp_C2 = _mm_set1_ps(-2.12194440e-4);
577 2 pi32_0x7f = _mm_set1_epi32(0x7f);
578
579 2 exp_p0 = _mm_set1_ps(1.9875691500e-4);
580 2 exp_p1 = _mm_set1_ps(1.3981999507e-3);
581 2 exp_p2 = _mm_set1_ps(8.3334519073e-3);
582 2 exp_p3 = _mm_set1_ps(4.1665795894e-2);
583 2 exp_p4 = _mm_set1_ps(1.6666665459e-1);
584 2 exp_p5 = _mm_set1_ps(5.0000001201e-1);
585
586
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
587 // First compute the logarithm
588 65534 aVal = _mm_loadu_ps(aPtr);
589 65534 bias = _mm_set1_epi32(127);
590 65534 leadingOne = _mm_set1_ps(1.0f);
591 327670 exp = _mm_sub_epi32(
592 _mm_srli_epi32(
593 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
594 bias);
595 65534 logarithm = _mm_cvtepi32_ps(exp);
596
597 262136 frac = _mm_or_ps(leadingOne,
598 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
599
600 #if POW_POLY_DEGREE == 6
601 mantissa = POLY5(frac,
602 3.1157899f,
603 -3.3241990f,
604 2.5988452f,
605 -1.2315303f,
606 3.1821337e-1f,
607 -3.4436006e-2f);
608 #elif POW_POLY_DEGREE == 5
609 mantissa = POLY4(frac,
610 2.8882704548164776201f,
611 -2.52074962577807006663f,
612 1.48116647521213171641f,
613 -0.465725644288844778798f,
614 0.0596515482674574969533f);
615 #elif POW_POLY_DEGREE == 4
616 mantissa = POLY3(frac,
617 2.61761038894603480148f,
618 -1.75647175389045657003f,
619 0.688243882994381274313f,
620 -0.107254423828329604454f);
621 #elif POW_POLY_DEGREE == 3
622 458738 mantissa = POLY2(frac,
623 2.28330284476918490682f,
624 -1.04913055217340124191f,
625 0.204446009836232697516f);
626 #else
627 #error
628 #endif
629
630 logarithm =
631 196602 _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
632 65534 logarithm = _mm_mul_ps(logarithm, ln2);
633
634
635 // Now calculate b*lna
636 65534 bVal = _mm_loadu_ps(bPtr);
637 65534 bVal = _mm_mul_ps(bVal, logarithm);
638
639 // Now compute exp(b*lna)
640 131068 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
641
642 131068 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
643
644 65534 emm0 = _mm_cvttps_epi32(fx);
645 65534 tmp = _mm_cvtepi32_ps(emm0);
646
647 131068 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
648 65534 fx = _mm_sub_ps(tmp, mask);
649
650 65534 tmp = _mm_mul_ps(fx, exp_C1);
651 65534 z = _mm_mul_ps(fx, exp_C2);
652 131068 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
653 65534 z = _mm_mul_ps(bVal, bVal);
654
655 196602 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
656 196602 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
657 196602 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
658 196602 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
659 65534 y = _mm_add_ps(y, one);
660
661 196602 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
662
663 65534 pow2n = _mm_castsi128_ps(emm0);
664 65534 cVal = _mm_mul_ps(y, pow2n);
665
666 _mm_storeu_ps(cPtr, cVal);
667
668 65534 aPtr += 4;
669 65534 bPtr += 4;
670 65534 cPtr += 4;
671 }
672
673 2 number = quarterPoints * 4;
674
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
675 6 *cPtr++ = powf(*aPtr++, *bPtr++);
676 }
677 2 }
678
679 #endif /* LV_HAVE_SSE4_1 for unaligned */
680
681 #if LV_HAVE_AVX2 && LV_HAVE_FMA
682 #include <immintrin.h>
683
684 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
685 #define POLY1_AVX2_FMA(x, c0, c1) \
686 _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
687 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
688 _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
689 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
690 _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
691 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
692 _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
693 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
694 _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
695
696 2 static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
697 const float* bVector,
698 const float* aVector,
699 unsigned int num_points)
700 {
701 2 float* cPtr = cVector;
702 2 const float* bPtr = bVector;
703 2 const float* aPtr = aVector;
704
705 2 unsigned int number = 0;
706 2 const unsigned int eighthPoints = num_points / 8;
707
708 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
709 __m256 tmp, fx, mask, pow2n, z, y;
710 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
711 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
712 __m256i bias, exp, emm0, pi32_0x7f;
713
714 2 one = _mm256_set1_ps(1.0);
715 2 exp_hi = _mm256_set1_ps(88.3762626647949);
716 2 exp_lo = _mm256_set1_ps(-88.3762626647949);
717 2 ln2 = _mm256_set1_ps(0.6931471805);
718 2 log2EF = _mm256_set1_ps(1.44269504088896341);
719 2 half = _mm256_set1_ps(0.5);
720 2 exp_C1 = _mm256_set1_ps(0.693359375);
721 2 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
722 2 pi32_0x7f = _mm256_set1_epi32(0x7f);
723
724 2 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
725 2 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
726 2 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
727 2 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
728 2 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
729 2 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
730
731
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
732 // First compute the logarithm
733 32766 aVal = _mm256_loadu_ps(aPtr);
734 32766 bias = _mm256_set1_epi32(127);
735 32766 leadingOne = _mm256_set1_ps(1.0f);
736 163830 exp = _mm256_sub_epi32(
737 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
738 _mm256_set1_epi32(0x7f800000)),
739 23),
740 bias);
741 32766 logarithm = _mm256_cvtepi32_ps(exp);
742
743 131064 frac = _mm256_or_ps(
744 leadingOne,
745 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
746
747 #if POW_POLY_DEGREE == 6
748 mantissa = POLY5_AVX2_FMA(frac,
749 3.1157899f,
750 -3.3241990f,
751 2.5988452f,
752 -1.2315303f,
753 3.1821337e-1f,
754 -3.4436006e-2f);
755 #elif POW_POLY_DEGREE == 5
756 mantissa = POLY4_AVX2_FMA(frac,
757 2.8882704548164776201f,
758 -2.52074962577807006663f,
759 1.48116647521213171641f,
760 -0.465725644288844778798f,
761 0.0596515482674574969533f);
762 #elif POW_POLY_DEGREE == 4
763 mantissa = POLY3_AVX2_FMA(frac,
764 2.61761038894603480148f,
765 -1.75647175389045657003f,
766 0.688243882994381274313f,
767 -0.107254423828329604454f);
768 #elif POW_POLY_DEGREE == 3
769 163830 mantissa = POLY2_AVX2_FMA(frac,
770 2.28330284476918490682f,
771 -1.04913055217340124191f,
772 0.204446009836232697516f);
773 #else
774 #error
775 #endif
776
777 65532 logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
778 32766 logarithm = _mm256_mul_ps(logarithm, ln2);
779
780
781 // Now calculate b*lna
782 32766 bVal = _mm256_loadu_ps(bPtr);
783 32766 bVal = _mm256_mul_ps(bVal, logarithm);
784
785 // Now compute exp(b*lna)
786 65532 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
787
788 32766 fx = _mm256_fmadd_ps(bVal, log2EF, half);
789
790 32766 emm0 = _mm256_cvttps_epi32(fx);
791 32766 tmp = _mm256_cvtepi32_ps(emm0);
792
793 65532 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
794 32766 fx = _mm256_sub_ps(tmp, mask);
795
796 32766 tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
797 32766 bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
798 32766 z = _mm256_mul_ps(bVal, bVal);
799
800 32766 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
801 32766 y = _mm256_fmadd_ps(y, bVal, exp_p2);
802 32766 y = _mm256_fmadd_ps(y, bVal, exp_p3);
803 32766 y = _mm256_fmadd_ps(y, bVal, exp_p4);
804 32766 y = _mm256_fmadd_ps(y, bVal, exp_p5);
805 32766 y = _mm256_fmadd_ps(y, z, bVal);
806 32766 y = _mm256_add_ps(y, one);
807
808 emm0 =
809 98298 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
810
811 32766 pow2n = _mm256_castsi256_ps(emm0);
812 32766 cVal = _mm256_mul_ps(y, pow2n);
813
814 _mm256_storeu_ps(cPtr, cVal);
815
816 32766 aPtr += 8;
817 32766 bPtr += 8;
818 32766 cPtr += 8;
819 }
820
821 2 number = eighthPoints * 8;
822
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
823 14 *cPtr++ = pow(*aPtr++, *bPtr++);
824 }
825 2 }
826
827 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
828
829 #ifdef LV_HAVE_AVX2
830 #include <immintrin.h>
831
832 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
833 #define POLY1_AVX2(x, c0, c1) \
834 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
835 #define POLY2_AVX2(x, c0, c1, c2) \
836 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
837 #define POLY3_AVX2(x, c0, c1, c2, c3) \
838 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
839 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
840 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
841 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
842 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
843
844 2 static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
845 const float* bVector,
846 const float* aVector,
847 unsigned int num_points)
848 {
849 2 float* cPtr = cVector;
850 2 const float* bPtr = bVector;
851 2 const float* aPtr = aVector;
852
853 2 unsigned int number = 0;
854 2 const unsigned int eighthPoints = num_points / 8;
855
856 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
857 __m256 tmp, fx, mask, pow2n, z, y;
858 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
859 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
860 __m256i bias, exp, emm0, pi32_0x7f;
861
862 2 one = _mm256_set1_ps(1.0);
863 2 exp_hi = _mm256_set1_ps(88.3762626647949);
864 2 exp_lo = _mm256_set1_ps(-88.3762626647949);
865 2 ln2 = _mm256_set1_ps(0.6931471805);
866 2 log2EF = _mm256_set1_ps(1.44269504088896341);
867 2 half = _mm256_set1_ps(0.5);
868 2 exp_C1 = _mm256_set1_ps(0.693359375);
869 2 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
870 2 pi32_0x7f = _mm256_set1_epi32(0x7f);
871
872 2 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
873 2 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
874 2 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
875 2 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
876 2 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
877 2 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
878
879
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
880 // First compute the logarithm
881 32766 aVal = _mm256_loadu_ps(aPtr);
882 32766 bias = _mm256_set1_epi32(127);
883 32766 leadingOne = _mm256_set1_ps(1.0f);
884 163830 exp = _mm256_sub_epi32(
885 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
886 _mm256_set1_epi32(0x7f800000)),
887 23),
888 bias);
889 32766 logarithm = _mm256_cvtepi32_ps(exp);
890
891 131064 frac = _mm256_or_ps(
892 leadingOne,
893 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
894
895 #if POW_POLY_DEGREE == 6
896 mantissa = POLY5_AVX2(frac,
897 3.1157899f,
898 -3.3241990f,
899 2.5988452f,
900 -1.2315303f,
901 3.1821337e-1f,
902 -3.4436006e-2f);
903 #elif POW_POLY_DEGREE == 5
904 mantissa = POLY4_AVX2(frac,
905 2.8882704548164776201f,
906 -2.52074962577807006663f,
907 1.48116647521213171641f,
908 -0.465725644288844778798f,
909 0.0596515482674574969533f);
910 #elif POW_POLY_DEGREE == 4
911 mantissa = POLY3_AVX2(frac,
912 2.61761038894603480148f,
913 -1.75647175389045657003f,
914 0.688243882994381274313f,
915 -0.107254423828329604454f);
916 #elif POW_POLY_DEGREE == 3
917 229362 mantissa = POLY2_AVX2(frac,
918 2.28330284476918490682f,
919 -1.04913055217340124191f,
920 0.204446009836232697516f);
921 #else
922 #error
923 #endif
924
925 98298 logarithm = _mm256_add_ps(
926 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
927 32766 logarithm = _mm256_mul_ps(logarithm, ln2);
928
929 // Now calculate b*lna
930 32766 bVal = _mm256_loadu_ps(bPtr);
931 32766 bVal = _mm256_mul_ps(bVal, logarithm);
932
933 // Now compute exp(b*lna)
934 65532 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
935
936 65532 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
937
938 32766 emm0 = _mm256_cvttps_epi32(fx);
939 32766 tmp = _mm256_cvtepi32_ps(emm0);
940
941 65532 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
942 32766 fx = _mm256_sub_ps(tmp, mask);
943
944 65532 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
945 65532 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
946 32766 z = _mm256_mul_ps(bVal, bVal);
947
948 65532 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
949 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
950 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
951 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
952 65532 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
953 65532 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
954 32766 y = _mm256_add_ps(y, one);
955
956 emm0 =
957 98298 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
958
959 32766 pow2n = _mm256_castsi256_ps(emm0);
960 32766 cVal = _mm256_mul_ps(y, pow2n);
961
962 _mm256_storeu_ps(cPtr, cVal);
963
964 32766 aPtr += 8;
965 32766 bPtr += 8;
966 32766 cPtr += 8;
967 }
968
969 2 number = eighthPoints * 8;
970
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
971 14 *cPtr++ = pow(*aPtr++, *bPtr++);
972 }
973 2 }
974
975 #endif /* LV_HAVE_AVX2 for unaligned */
976
977 #endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
978