Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2020 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_s32f_add_32f | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Adds a floating point scalar to a floating point vector. | ||
16 | * | ||
17 | * <b>Dispatcher Prototype</b> | ||
18 | * \code | ||
19 | * void volk_32f_s32f_add_32f(float* cVector, const float* aVector, const float scalar, | ||
20 | * unsigned int num_points) \endcode | ||
21 | * | ||
22 | * \b Inputs | ||
23 | * \li aVector: The input vector of floats. | ||
24 | * \li scalar: the scalar value to add against \p aVector. | ||
25 | * \li num_points: The number of data points. | ||
26 | * | ||
27 | * \b Outputs | ||
28 | * \li cVector: The output vector of floats. | ||
29 | * | ||
30 | * \b Example | ||
31 | * \code | ||
32 | * int N = 10; | ||
33 | * unsigned int alignment = volk_get_alignment(); | ||
34 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
35 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); | ||
36 | * | ||
37 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
38 | * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; | ||
39 | * } | ||
40 | * | ||
41 | * // Add addshift to each entry. | ||
42 | * float addshift = 5.0f; | ||
43 | * | ||
44 | * volk_32f_s32f_add_32f(out, increasing, addshift, N); | ||
45 | * | ||
46 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
47 | * printf("out[%u] = %f\n", ii, out[ii]); | ||
48 | * } | ||
49 | * | ||
50 | * volk_free(increasing); | ||
51 | * volk_free(out); | ||
52 | * \endcode | ||
53 | */ | ||
54 | |||
55 | #include <inttypes.h> | ||
56 | #include <stdio.h> | ||
57 | |||
58 | #ifndef INCLUDED_volk_32f_s32f_add_32f_u_H | ||
59 | #define INCLUDED_volk_32f_s32f_add_32f_u_H | ||
60 | |||
61 | #ifdef LV_HAVE_GENERIC | ||
62 | |||
63 | 10 | static inline void volk_32f_s32f_add_32f_generic(float* cVector, | |
64 | const float* aVector, | ||
65 | const float scalar, | ||
66 | unsigned int num_points) | ||
67 | { | ||
68 | 10 | unsigned int number = 0; | |
69 | 10 | const float* inputPtr = aVector; | |
70 | 10 | float* outputPtr = cVector; | |
71 |
2/2✓ Branch 0 taken 262182 times.
✓ Branch 1 taken 10 times.
|
262192 | for (number = 0; number < num_points; number++) { |
72 | 262182 | *outputPtr = (*inputPtr) + scalar; | |
73 | 262182 | inputPtr++; | |
74 | 262182 | outputPtr++; | |
75 | } | ||
76 | 10 | } | |
77 | |||
78 | #endif /* LV_HAVE_GENERIC */ | ||
79 | #ifdef LV_HAVE_SSE | ||
80 | #include <xmmintrin.h> | ||
81 | |||
82 | 2 | static inline void volk_32f_s32f_add_32f_u_sse(float* cVector, | |
83 | const float* aVector, | ||
84 | const float scalar, | ||
85 | unsigned int num_points) | ||
86 | { | ||
87 | 2 | unsigned int number = 0; | |
88 | 2 | const unsigned int quarterPoints = num_points / 4; | |
89 | |||
90 | 2 | float* cPtr = cVector; | |
91 | 2 | const float* aPtr = aVector; | |
92 | |||
93 | __m128 aVal, bVal, cVal; | ||
94 | 2 | bVal = _mm_set_ps1(scalar); | |
95 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
96 | 65534 | aVal = _mm_loadu_ps(aPtr); | |
97 | |||
98 | 65534 | cVal = _mm_add_ps(aVal, bVal); | |
99 | |||
100 | _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container | ||
101 | |||
102 | 65534 | aPtr += 4; | |
103 | 65534 | cPtr += 4; | |
104 | } | ||
105 | |||
106 | 2 | number = quarterPoints * 4; | |
107 | 2 | volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number); | |
108 | 2 | } | |
109 | #endif /* LV_HAVE_SSE */ | ||
110 | |||
111 | #ifdef LV_HAVE_AVX | ||
112 | #include <immintrin.h> | ||
113 | |||
114 | 2 | static inline void volk_32f_s32f_add_32f_u_avx(float* cVector, | |
115 | const float* aVector, | ||
116 | const float scalar, | ||
117 | unsigned int num_points) | ||
118 | { | ||
119 | 2 | unsigned int number = 0; | |
120 | 2 | const unsigned int eighthPoints = num_points / 8; | |
121 | |||
122 | 2 | float* cPtr = cVector; | |
123 | 2 | const float* aPtr = aVector; | |
124 | |||
125 | __m256 aVal, bVal, cVal; | ||
126 | 2 | bVal = _mm256_set1_ps(scalar); | |
127 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
128 | |||
129 | 32766 | aVal = _mm256_loadu_ps(aPtr); | |
130 | |||
131 | 32766 | cVal = _mm256_add_ps(aVal, bVal); | |
132 | |||
133 | _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container | ||
134 | |||
135 | 32766 | aPtr += 8; | |
136 | 32766 | cPtr += 8; | |
137 | } | ||
138 | |||
139 | 2 | number = eighthPoints * 8; | |
140 | 2 | volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number); | |
141 | 2 | } | |
142 | #endif /* LV_HAVE_AVX */ | ||
143 | |||
144 | #ifdef LV_HAVE_NEON | ||
145 | #include <arm_neon.h> | ||
146 | |||
147 | static inline void volk_32f_s32f_add_32f_u_neon(float* cVector, | ||
148 | const float* aVector, | ||
149 | const float scalar, | ||
150 | unsigned int num_points) | ||
151 | { | ||
152 | unsigned int number = 0; | ||
153 | const float* inputPtr = aVector; | ||
154 | float* outputPtr = cVector; | ||
155 | const unsigned int quarterPoints = num_points / 4; | ||
156 | |||
157 | float32x4_t aVal, cVal, scalarvec; | ||
158 | |||
159 | scalarvec = vdupq_n_f32(scalar); | ||
160 | |||
161 | for (number = 0; number < quarterPoints; number++) { | ||
162 | aVal = vld1q_f32(inputPtr); // Load into NEON regs | ||
163 | cVal = vaddq_f32(aVal, scalarvec); // Do the add | ||
164 | vst1q_f32(outputPtr, cVal); // Store results back to output | ||
165 | inputPtr += 4; | ||
166 | outputPtr += 4; | ||
167 | } | ||
168 | |||
169 | number = quarterPoints * 4; | ||
170 | volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points - number); | ||
171 | } | ||
172 | #endif /* LV_HAVE_NEON */ | ||
173 | |||
174 | |||
175 | #endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */ | ||
176 | |||
177 | |||
178 | #ifndef INCLUDED_volk_32f_s32f_add_32f_a_H | ||
179 | #define INCLUDED_volk_32f_s32f_add_32f_a_H | ||
180 | |||
181 | #ifdef LV_HAVE_SSE | ||
182 | #include <xmmintrin.h> | ||
183 | |||
184 | 2 | static inline void volk_32f_s32f_add_32f_a_sse(float* cVector, | |
185 | const float* aVector, | ||
186 | const float scalar, | ||
187 | unsigned int num_points) | ||
188 | { | ||
189 | 2 | unsigned int number = 0; | |
190 | 2 | const unsigned int quarterPoints = num_points / 4; | |
191 | |||
192 | 2 | float* cPtr = cVector; | |
193 | 2 | const float* aPtr = aVector; | |
194 | |||
195 | __m128 aVal, bVal, cVal; | ||
196 | 2 | bVal = _mm_set_ps1(scalar); | |
197 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
198 | 65534 | aVal = _mm_load_ps(aPtr); | |
199 | |||
200 | 65534 | cVal = _mm_add_ps(aVal, bVal); | |
201 | |||
202 | _mm_store_ps(cPtr, cVal); // Store the results back into the C container | ||
203 | |||
204 | 65534 | aPtr += 4; | |
205 | 65534 | cPtr += 4; | |
206 | } | ||
207 | |||
208 | 2 | number = quarterPoints * 4; | |
209 | 2 | volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number); | |
210 | 2 | } | |
211 | #endif /* LV_HAVE_SSE */ | ||
212 | |||
213 | #ifdef LV_HAVE_AVX | ||
214 | #include <immintrin.h> | ||
215 | |||
216 | 2 | static inline void volk_32f_s32f_add_32f_a_avx(float* cVector, | |
217 | const float* aVector, | ||
218 | const float scalar, | ||
219 | unsigned int num_points) | ||
220 | { | ||
221 | 2 | unsigned int number = 0; | |
222 | 2 | const unsigned int eighthPoints = num_points / 8; | |
223 | |||
224 | 2 | float* cPtr = cVector; | |
225 | 2 | const float* aPtr = aVector; | |
226 | |||
227 | __m256 aVal, bVal, cVal; | ||
228 | 2 | bVal = _mm256_set1_ps(scalar); | |
229 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
230 | 32766 | aVal = _mm256_load_ps(aPtr); | |
231 | |||
232 | 32766 | cVal = _mm256_add_ps(aVal, bVal); | |
233 | |||
234 | _mm256_store_ps(cPtr, cVal); // Store the results back into the C container | ||
235 | |||
236 | 32766 | aPtr += 8; | |
237 | 32766 | cPtr += 8; | |
238 | } | ||
239 | |||
240 | 2 | number = eighthPoints * 8; | |
241 | 2 | volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number); | |
242 | 2 | } | |
243 | #endif /* LV_HAVE_AVX */ | ||
244 | |||
245 | #ifdef LV_HAVE_ORC | ||
246 | |||
247 | extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst, | ||
248 | const float* src, | ||
249 | const float scalar, | ||
250 | unsigned int num_points); | ||
251 | |||
252 | 2 | static inline void volk_32f_s32f_add_32f_u_orc(float* cVector, | |
253 | const float* aVector, | ||
254 | const float scalar, | ||
255 | unsigned int num_points) | ||
256 | { | ||
257 | 2 | volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points); | |
258 | 2 | } | |
259 | #endif /* LV_HAVE_ORC */ | ||
260 | |||
261 | #endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */ | ||
262 |