GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_multiply_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 81 81 100.0%
Functions: 7 7 100.0%
Branches: 20 20 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_s32f_multiply_32f
12 *
13 * \b Overview
14 *
15 * Multiplies a floating point vector by a floating point scalar.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector,
20 * const float scalar, unsigned int num_points) \endcode
21 *
22 * \b Inputs
23 * \li aVector: The input vector of floats.
24 * \li scalar: The scalar value by which each element of \p aVector is multiplied.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li cVector: The output vector of floats.
29 *
30 * \b Example
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 *
38 * for(unsigned int ii = 0; ii < N; ++ii){
39 * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
40 * }
41 *
42 * // Normalize by the smallest delta (0.2 in this example)
43 * float scale = 5.0f;
44 *
45 * volk_32f_s32f_multiply_32f(out, increasing, scale, N);
46 *
47 * for(unsigned int ii = 0; ii < N; ++ii){
48 * printf("out[%u] = %f\n", ii, out[ii]);
49 * }
50 *
51 * volk_free(increasing);
52 * volk_free(out);
53 * \endcode
54 */
55
56 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58
59 #include <inttypes.h>
60 #include <stdio.h>
61
62 #ifdef LV_HAVE_SSE
63 #include <xmmintrin.h>
64
65 2 static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
66 const float* aVector,
67 const float scalar,
68 unsigned int num_points)
69 {
70 2 unsigned int number = 0;
71 2 const unsigned int quarterPoints = num_points / 4;
72
73 2 float* cPtr = cVector;
74 2 const float* aPtr = aVector;
75
76 __m128 aVal, bVal, cVal;
77 2 bVal = _mm_set_ps1(scalar);
78
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
79 65534 aVal = _mm_loadu_ps(aPtr);
80
81 65534 cVal = _mm_mul_ps(aVal, bVal);
82
83 _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
84
85 65534 aPtr += 4;
86 65534 cPtr += 4;
87 }
88
89 2 number = quarterPoints * 4;
90
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
91 6 *cPtr++ = (*aPtr++) * scalar;
92 }
93 2 }
94 #endif /* LV_HAVE_SSE */
95
96 #ifdef LV_HAVE_AVX
97 #include <immintrin.h>
98
99 2 static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
100 const float* aVector,
101 const float scalar,
102 unsigned int num_points)
103 {
104 2 unsigned int number = 0;
105 2 const unsigned int eighthPoints = num_points / 8;
106
107 2 float* cPtr = cVector;
108 2 const float* aPtr = aVector;
109
110 __m256 aVal, bVal, cVal;
111 2 bVal = _mm256_set1_ps(scalar);
112
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
113
114 32766 aVal = _mm256_loadu_ps(aPtr);
115
116 32766 cVal = _mm256_mul_ps(aVal, bVal);
117
118 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
119
120 32766 aPtr += 8;
121 32766 cPtr += 8;
122 }
123
124 2 number = eighthPoints * 8;
125
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
126 14 *cPtr++ = (*aPtr++) * scalar;
127 }
128 2 }
129 #endif /* LV_HAVE_AVX */
130
131 #ifdef LV_HAVE_RISCV64
132 extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
133 const float* aVector,
134 const float scalar,
135 unsigned int num_points);
136 #endif /* LV_HAVE_RISCV64 */
137
138 #ifdef LV_HAVE_GENERIC
139 2 static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
140 const float* aVector,
141 const float scalar,
142 unsigned int num_points)
143 {
144 2 unsigned int number = 0;
145 2 const float* inputPtr = aVector;
146 2 float* outputPtr = cVector;
147
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
148 262142 *outputPtr = (*inputPtr) * scalar;
149 262142 inputPtr++;
150 262142 outputPtr++;
151 }
152 2 }
153 #endif /* LV_HAVE_GENERIC */
154
155 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
156
157
158 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
159 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
160
161 #include <inttypes.h>
162 #include <stdio.h>
163
164 #ifdef LV_HAVE_SSE
165 #include <xmmintrin.h>
166
167 2 static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
168 const float* aVector,
169 const float scalar,
170 unsigned int num_points)
171 {
172 2 unsigned int number = 0;
173 2 const unsigned int quarterPoints = num_points / 4;
174
175 2 float* cPtr = cVector;
176 2 const float* aPtr = aVector;
177
178 __m128 aVal, bVal, cVal;
179 2 bVal = _mm_set_ps1(scalar);
180
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
181 65534 aVal = _mm_load_ps(aPtr);
182
183 65534 cVal = _mm_mul_ps(aVal, bVal);
184
185 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
186
187 65534 aPtr += 4;
188 65534 cPtr += 4;
189 }
190
191 2 number = quarterPoints * 4;
192
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
193 6 *cPtr++ = (*aPtr++) * scalar;
194 }
195 2 }
196 #endif /* LV_HAVE_SSE */
197
198 #ifdef LV_HAVE_AVX
199 #include <immintrin.h>
200
201 6 static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
202 const float* aVector,
203 const float scalar,
204 unsigned int num_points)
205 {
206 6 unsigned int number = 0;
207 6 const unsigned int eighthPoints = num_points / 8;
208
209 6 float* cPtr = cVector;
210 6 const float* aPtr = aVector;
211
212 __m256 aVal, bVal, cVal;
213 6 bVal = _mm256_set1_ps(scalar);
214
2/2
✓ Branch 0 taken 98298 times.
✓ Branch 1 taken 6 times.
98304 for (; number < eighthPoints; number++) {
215 98298 aVal = _mm256_load_ps(aPtr);
216
217 98298 cVal = _mm256_mul_ps(aVal, bVal);
218
219 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
220
221 98298 aPtr += 8;
222 98298 cPtr += 8;
223 }
224
225 6 number = eighthPoints * 8;
226
2/2
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 6 times.
48 for (; number < num_points; number++) {
227 42 *cPtr++ = (*aPtr++) * scalar;
228 }
229 6 }
230 #endif /* LV_HAVE_AVX */
231
232 #ifdef LV_HAVE_NEON
233 #include <arm_neon.h>
234
235 static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
236 const float* aVector,
237 const float scalar,
238 unsigned int num_points)
239 {
240 unsigned int number = 0;
241 const float* inputPtr = aVector;
242 float* outputPtr = cVector;
243 const unsigned int quarterPoints = num_points / 4;
244
245 float32x4_t aVal, cVal;
246
247 for (number = 0; number < quarterPoints; number++) {
248 aVal = vld1q_f32(inputPtr); // Load into NEON regs
249 cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
250 vst1q_f32(outputPtr, cVal); // Store results back to output
251 inputPtr += 4;
252 outputPtr += 4;
253 }
254 for (number = quarterPoints * 4; number < num_points; number++) {
255 *outputPtr++ = (*inputPtr++) * scalar;
256 }
257 }
258 #endif /* LV_HAVE_NEON */
259
260
261 #ifdef LV_HAVE_GENERIC
262
263 2 static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
264 const float* aVector,
265 const float scalar,
266 unsigned int num_points)
267 {
268 2 unsigned int number = 0;
269 2 const float* inputPtr = aVector;
270 2 float* outputPtr = cVector;
271
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
272 262142 *outputPtr = (*inputPtr) * scalar;
273 262142 inputPtr++;
274 262142 outputPtr++;
275 }
276 2 }
277 #endif /* LV_HAVE_GENERIC */
278
279
280 #ifdef LV_HAVE_ORC
281
282 extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
283 const float* src,
284 const float scalar,
285 unsigned int num_points);
286
287 2 static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
288 const float* aVector,
289 const float scalar,
290 unsigned int num_points)
291 {
292 2 volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
293 2 }
294
295 #endif /* LV_HAVE_ORC */
296
297 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
298