Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_s32f_power_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Takes each input vector value to the specified power and stores the | ||
16 | * results in the return vector. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, | ||
21 | * unsigned int num_points) \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li aVector: The input vector of floats. | ||
25 | * \li power: The power to raise the input value to. | ||
26 | * \li num_points: The number of data points. | ||
27 | * | ||
28 | * \b Outputs | ||
29 | * \li cVector: The output vector. | ||
30 | * | ||
31 | * \b Example | ||
32 | * Square the integers 0 through 9 | ||
33 | * \code | ||
34 | * int N = 10; | ||
35 | * unsigned int alignment = volk_get_alignment(); | ||
36 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
37 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
38 | * | ||
39 | * | ||
40 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
41 | * increasing[ii] = (float)ii; | ||
42 | * } | ||
43 | * | ||
44 | * // Exponent applied to every element (2.0 squares the input) | ||
45 | * float scale = 2.0f; | ||
46 | * | ||
47 | * volk_32f_s32f_power_32f(out, increasing, scale, N); | ||
48 | * | ||
49 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
50 | * printf("out[%u] = %f\n", ii, out[ii]); | ||
51 | * } | ||
52 | * | ||
53 | * volk_free(increasing); | ||
54 | * volk_free(out); | ||
55 | * \endcode | ||
56 | */ | ||
57 | |||
58 | #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H | ||
59 | #define INCLUDED_volk_32f_s32f_power_32f_a_H | ||
60 | |||
61 | #include <inttypes.h> | ||
62 | #include <math.h> | ||
63 | #include <stdio.h> | ||
64 | |||
65 | #ifdef LV_HAVE_SSE4_1 | ||
#include <smmintrin.h>
#include <tmmintrin.h>
67 | |||
68 | #ifdef LV_HAVE_LIB_SIMDMATH | ||
69 | #include <simdmath.h> | ||
70 | #endif /* LV_HAVE_LIB_SIMDMATH */ | ||
71 | |||
72 | 2 | static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, | |
73 | const float* aVector, | ||
74 | const float power, | ||
75 | unsigned int num_points) | ||
76 | { | ||
77 | 2 | unsigned int number = 0; | |
78 | |||
79 | 2 | float* cPtr = cVector; | |
80 | 2 | const float* aPtr = aVector; | |
81 | |||
82 | #ifdef LV_HAVE_LIB_SIMDMATH | ||
83 | const unsigned int quarterPoints = num_points / 4; | ||
84 | __m128 vPower = _mm_set_ps1(power); | ||
85 | __m128 zeroValue = _mm_setzero_ps(); | ||
86 | __m128 signMask; | ||
87 | __m128 negatedValues; | ||
88 | __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); | ||
89 | __m128 onesMask = _mm_set_ps1(1); | ||
90 | |||
91 | __m128 aVal, cVal; | ||
92 | for (; number < quarterPoints; number++) { | ||
93 | |||
94 | aVal = _mm_load_ps(aPtr); | ||
95 | signMask = _mm_cmplt_ps(aVal, zeroValue); | ||
96 | negatedValues = _mm_sub_ps(zeroValue, aVal); | ||
97 | aVal = _mm_blendv_ps(aVal, negatedValues, signMask); | ||
98 | |||
99 | // powf4 doesn't support negative values in the base, so we mask them off and then | ||
100 | // apply the negative after | ||
101 | cVal = powf4(aVal, vPower); // Takes each input value to the specified power | ||
102 | |||
103 | cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); | ||
104 | |||
105 | _mm_store_ps(cPtr, cVal); // Store the results back into the C container | ||
106 | |||
107 | aPtr += 4; | ||
108 | cPtr += 4; | ||
109 | } | ||
110 | |||
111 | number = quarterPoints * 4; | ||
112 | #endif /* LV_HAVE_LIB_SIMDMATH */ | ||
113 | |||
114 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
115 | 262142 | *cPtr++ = powf((*aPtr++), power); | |
116 | } | ||
117 | 2 | } | |
118 | |||
119 | #endif /* LV_HAVE_SSE4_1 */ | ||
120 | |||
121 | |||
122 | #ifdef LV_HAVE_SSE | ||
123 | #include <xmmintrin.h> | ||
124 | |||
125 | #ifdef LV_HAVE_LIB_SIMDMATH | ||
126 | #include <simdmath.h> | ||
127 | #endif /* LV_HAVE_LIB_SIMDMATH */ | ||
128 | |||
129 | 2 | static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, | |
130 | const float* aVector, | ||
131 | const float power, | ||
132 | unsigned int num_points) | ||
133 | { | ||
134 | 2 | unsigned int number = 0; | |
135 | |||
136 | 2 | float* cPtr = cVector; | |
137 | 2 | const float* aPtr = aVector; | |
138 | |||
139 | #ifdef LV_HAVE_LIB_SIMDMATH | ||
140 | const unsigned int quarterPoints = num_points / 4; | ||
141 | __m128 vPower = _mm_set_ps1(power); | ||
142 | __m128 zeroValue = _mm_setzero_ps(); | ||
143 | __m128 signMask; | ||
144 | __m128 negatedValues; | ||
145 | __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); | ||
146 | __m128 onesMask = _mm_set_ps1(1); | ||
147 | |||
148 | __m128 aVal, cVal; | ||
149 | for (; number < quarterPoints; number++) { | ||
150 | |||
151 | aVal = _mm_load_ps(aPtr); | ||
152 | signMask = _mm_cmplt_ps(aVal, zeroValue); | ||
153 | negatedValues = _mm_sub_ps(zeroValue, aVal); | ||
154 | aVal = | ||
155 | _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues)); | ||
156 | |||
157 | // powf4 doesn't support negative values in the base, so we mask them off and then | ||
158 | // apply the negative after | ||
159 | cVal = powf4(aVal, vPower); // Takes each input value to the specified power | ||
160 | |||
161 | cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask), | ||
162 | _mm_and_ps(signMask, negativeOneToPower)), | ||
163 | cVal); | ||
164 | |||
165 | _mm_store_ps(cPtr, cVal); // Store the results back into the C container | ||
166 | |||
167 | aPtr += 4; | ||
168 | cPtr += 4; | ||
169 | } | ||
170 | |||
171 | number = quarterPoints * 4; | ||
172 | #endif /* LV_HAVE_LIB_SIMDMATH */ | ||
173 | |||
174 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (; number < num_points; number++) { |
175 | 262142 | *cPtr++ = powf((*aPtr++), power); | |
176 | } | ||
177 | 2 | } | |
178 | |||
179 | #endif /* LV_HAVE_SSE */ | ||
180 | |||
181 | |||
182 | #ifdef LV_HAVE_GENERIC | ||
183 | |||
184 | 2 | static inline void volk_32f_s32f_power_32f_generic(float* cVector, | |
185 | const float* aVector, | ||
186 | const float power, | ||
187 | unsigned int num_points) | ||
188 | { | ||
189 | 2 | float* cPtr = cVector; | |
190 | 2 | const float* aPtr = aVector; | |
191 | 2 | unsigned int number = 0; | |
192 | |||
193 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
194 | 262142 | *cPtr++ = powf((*aPtr++), power); | |
195 | } | ||
196 | 2 | } | |
197 | #endif /* LV_HAVE_GENERIC */ | ||
198 | |||
199 | |||
200 | #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */ | ||
201 |