GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_x2_multiply_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 87 121 71.9%
Functions: 7 9 77.8%
Branches: 20 28 71.4%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_x2_multiply_32f
12 *
13 * \b Overview
14 *
15 * Multiplies two input floating point vectors together.
16 *
17 * c[i] = a[i] * b[i]
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float*
22 * bVector, unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li aVector: First input vector.
26 * \li bVector: Second input vector.
27 * \li num_points: The number of values in both input vectors.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 * Multiply elements of an increasing vector by those of a decreasing vector.
34 * \code
35 * int N = 10;
36 * unsigned int alignment = volk_get_alignment();
37 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
38 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
39 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
40 *
41 * for(unsigned int ii = 0; ii < N; ++ii){
42 * increasing[ii] = (float)ii;
43 * decreasing[ii] = 10.f - (float)ii;
44 * }
45 *
46 * volk_32f_x2_multiply_32f(out, increasing, decreasing, N);
47 *
48 * for(unsigned int ii = 0; ii < N; ++ii){
49 * printf("out[%u] = %1.2f\n", ii, out[ii]);
50 * }
51 *
52 * volk_free(increasing);
53 * volk_free(decreasing);
54 * volk_free(out);
55 * \endcode
56 */
57
58 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
59 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
60
61 #include <inttypes.h>
62 #include <stdio.h>
63
64 #ifdef LV_HAVE_SSE
65 #include <xmmintrin.h>
66
67 2 static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
68 const float* aVector,
69 const float* bVector,
70 unsigned int num_points)
71 {
72 2 unsigned int number = 0;
73 2 const unsigned int quarterPoints = num_points / 4;
74
75 2 float* cPtr = cVector;
76 2 const float* aPtr = aVector;
77 2 const float* bPtr = bVector;
78
79 __m128 aVal, bVal, cVal;
80
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
81
82 65534 aVal = _mm_loadu_ps(aPtr);
83 65534 bVal = _mm_loadu_ps(bPtr);
84
85 65534 cVal = _mm_mul_ps(aVal, bVal);
86
87 _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
88
89 65534 aPtr += 4;
90 65534 bPtr += 4;
91 65534 cPtr += 4;
92 }
93
94 2 number = quarterPoints * 4;
95
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
96 6 *cPtr++ = (*aPtr++) * (*bPtr++);
97 }
98 2 }
99 #endif /* LV_HAVE_SSE */
100
101 #ifdef LV_HAVE_AVX512F
102 #include <immintrin.h>
103
104 static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
105 const float* aVector,
106 const float* bVector,
107 unsigned int num_points)
108 {
109 unsigned int number = 0;
110 const unsigned int sixteenthPoints = num_points / 16;
111
112 float* cPtr = cVector;
113 const float* aPtr = aVector;
114 const float* bPtr = bVector;
115
116 __m512 aVal, bVal, cVal;
117 for (; number < sixteenthPoints; number++) {
118
119 aVal = _mm512_loadu_ps(aPtr);
120 bVal = _mm512_loadu_ps(bPtr);
121
122 cVal = _mm512_mul_ps(aVal, bVal);
123
124 _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
125
126 aPtr += 16;
127 bPtr += 16;
128 cPtr += 16;
129 }
130
131 number = sixteenthPoints * 16;
132 for (; number < num_points; number++) {
133 *cPtr++ = (*aPtr++) * (*bPtr++);
134 }
135 }
136 #endif /* LV_HAVE_AVX512F */
137
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
140
141 2 static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
142 const float* aVector,
143 const float* bVector,
144 unsigned int num_points)
145 {
146 2 unsigned int number = 0;
147 2 const unsigned int eighthPoints = num_points / 8;
148
149 2 float* cPtr = cVector;
150 2 const float* aPtr = aVector;
151 2 const float* bPtr = bVector;
152
153 __m256 aVal, bVal, cVal;
154
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
155
156 32766 aVal = _mm256_loadu_ps(aPtr);
157 32766 bVal = _mm256_loadu_ps(bPtr);
158
159 32766 cVal = _mm256_mul_ps(aVal, bVal);
160
161 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
162
163 32766 aPtr += 8;
164 32766 bPtr += 8;
165 32766 cPtr += 8;
166 }
167
168 2 number = eighthPoints * 8;
169
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
170 14 *cPtr++ = (*aPtr++) * (*bPtr++);
171 }
172 2 }
173 #endif /* LV_HAVE_AVX */
174
175
176 #ifdef LV_HAVE_GENERIC
177
178 2 static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
179 const float* aVector,
180 const float* bVector,
181 unsigned int num_points)
182 {
183 2 float* cPtr = cVector;
184 2 const float* aPtr = aVector;
185 2 const float* bPtr = bVector;
186 2 unsigned int number = 0;
187
188
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
189 262142 *cPtr++ = (*aPtr++) * (*bPtr++);
190 }
191 2 }
192 #endif /* LV_HAVE_GENERIC */
193
194
195 #endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
196
197
198 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
199 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
200
201 #include <inttypes.h>
202 #include <stdio.h>
203
204 #ifdef LV_HAVE_SSE
205 #include <xmmintrin.h>
206
207 2 static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
208 const float* aVector,
209 const float* bVector,
210 unsigned int num_points)
211 {
212 2 unsigned int number = 0;
213 2 const unsigned int quarterPoints = num_points / 4;
214
215 2 float* cPtr = cVector;
216 2 const float* aPtr = aVector;
217 2 const float* bPtr = bVector;
218
219 __m128 aVal, bVal, cVal;
220
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
221
222 65534 aVal = _mm_load_ps(aPtr);
223 65534 bVal = _mm_load_ps(bPtr);
224
225 65534 cVal = _mm_mul_ps(aVal, bVal);
226
227 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
228
229 65534 aPtr += 4;
230 65534 bPtr += 4;
231 65534 cPtr += 4;
232 }
233
234 2 number = quarterPoints * 4;
235
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
236 6 *cPtr++ = (*aPtr++) * (*bPtr++);
237 }
238 2 }
239 #endif /* LV_HAVE_SSE */
240
241 #ifdef LV_HAVE_AVX512F
242 #include <immintrin.h>
243
244 static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
245 const float* aVector,
246 const float* bVector,
247 unsigned int num_points)
248 {
249 unsigned int number = 0;
250 const unsigned int sixteenthPoints = num_points / 16;
251
252 float* cPtr = cVector;
253 const float* aPtr = aVector;
254 const float* bPtr = bVector;
255
256 __m512 aVal, bVal, cVal;
257 for (; number < sixteenthPoints; number++) {
258
259 aVal = _mm512_load_ps(aPtr);
260 bVal = _mm512_load_ps(bPtr);
261
262 cVal = _mm512_mul_ps(aVal, bVal);
263
264 _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
265
266 aPtr += 16;
267 bPtr += 16;
268 cPtr += 16;
269 }
270
271 number = sixteenthPoints * 16;
272 for (; number < num_points; number++) {
273 *cPtr++ = (*aPtr++) * (*bPtr++);
274 }
275 }
276 #endif /* LV_HAVE_AVX512F */
277
278
279 #ifdef LV_HAVE_AVX
280 #include <immintrin.h>
281
282 2 static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
283 const float* aVector,
284 const float* bVector,
285 unsigned int num_points)
286 {
287 2 unsigned int number = 0;
288 2 const unsigned int eighthPoints = num_points / 8;
289
290 2 float* cPtr = cVector;
291 2 const float* aPtr = aVector;
292 2 const float* bPtr = bVector;
293
294 __m256 aVal, bVal, cVal;
295
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
296
297 32766 aVal = _mm256_load_ps(aPtr);
298 32766 bVal = _mm256_load_ps(bPtr);
299
300 32766 cVal = _mm256_mul_ps(aVal, bVal);
301
302 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
303
304 32766 aPtr += 8;
305 32766 bPtr += 8;
306 32766 cPtr += 8;
307 }
308
309 2 number = eighthPoints * 8;
310
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
311 14 *cPtr++ = (*aPtr++) * (*bPtr++);
312 }
313 2 }
314 #endif /* LV_HAVE_AVX */
315
316
317 #ifdef LV_HAVE_NEON
318 #include <arm_neon.h>
319
320 static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
321 const float* aVector,
322 const float* bVector,
323 unsigned int num_points)
324 {
325 const unsigned int quarter_points = num_points / 4;
326 unsigned int number;
327 float32x4_t avec, bvec, cvec;
328 for (number = 0; number < quarter_points; ++number) {
329 avec = vld1q_f32(aVector);
330 bvec = vld1q_f32(bVector);
331 cvec = vmulq_f32(avec, bvec);
332 vst1q_f32(cVector, cvec);
333 aVector += 4;
334 bVector += 4;
335 cVector += 4;
336 }
337 for (number = quarter_points * 4; number < num_points; ++number) {
338 *cVector++ = *aVector++ * *bVector++;
339 }
340 }
341 #endif /* LV_HAVE_NEON */
342
343
344 #ifdef LV_HAVE_GENERIC
345
346 2 static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
347 const float* aVector,
348 const float* bVector,
349 unsigned int num_points)
350 {
351 2 float* cPtr = cVector;
352 2 const float* aPtr = aVector;
353 2 const float* bPtr = bVector;
354 2 unsigned int number = 0;
355
356
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
357 262142 *cPtr++ = (*aPtr++) * (*bPtr++);
358 }
359 2 }
360 #endif /* LV_HAVE_GENERIC */
361
362
363 #ifdef LV_HAVE_ORC
364 extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
365 const float* aVector,
366 const float* bVector,
367 unsigned int num_points);
368
369 2 static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
370 const float* aVector,
371 const float* bVector,
372 unsigned int num_points)
373 {
374 2 volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
375 2 }
376 #endif /* LV_HAVE_ORC */
377
378
379 #endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
380