GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_s32f_add_32f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 68 68 100.0%
Functions: 6 6 100.0%
Branches: 10 10 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_s32f_add_32f
12 *
13 * \b Overview
14 *
15 * Adds a floating point scalar to a floating point vector.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32f_s32f_add_32f(float* cVector, const float* aVector, const float scalar,
20 * unsigned int num_points) \endcode
21 *
22 * \b Inputs
23 * \li aVector: The input vector of floats.
24 * \li scalar: The scalar value added to each element of \p aVector.
25 * \li num_points: The number of data points.
26 *
27 * \b Outputs
28 * \li cVector: The output vector of floats.
29 *
30 * \b Example
31 * \code
32 * int N = 10;
33 * unsigned int alignment = volk_get_alignment();
34 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
35 * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
36 *
37 * for(unsigned int ii = 0; ii < N; ++ii){
38 * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
39 * }
40 *
41 * // Add addshift to each entry.
42 * float addshift = 5.0f;
43 *
44 * volk_32f_s32f_add_32f(out, increasing, addshift, N);
45 *
46 * for(unsigned int ii = 0; ii < N; ++ii){
47 * printf("out[%u] = %f\n", ii, out[ii]);
48 * }
49 *
50 * volk_free(increasing);
51 * volk_free(out);
52 * \endcode
53 */
54
55 #include <inttypes.h>
56 #include <stdio.h>
57
58 #ifndef INCLUDED_volk_32f_s32f_add_32f_u_H
59 #define INCLUDED_volk_32f_s32f_add_32f_u_H
60
61 #ifdef LV_HAVE_GENERIC
62
/*
 * Portable scalar kernel: for every index i in [0, num_points) stores
 * aVector[i] + scalar into cVector[i].
 *
 * cVector    - output buffer; one float is written per iteration.
 * aVector    - input buffer; one float is read per iteration.
 * scalar     - value added to every input element.
 * num_points - number of elements to process; when it is 0 the loop body
 *              never runs and neither buffer is touched.
 *
 * The SIMD kernels in this file call this function to process the elements
 * left over after their vector loops.
 */
63 10 static inline void volk_32f_s32f_add_32f_generic(float* cVector,
64 const float* aVector,
65 const float scalar,
66 unsigned int num_points)
67 {
68 10 unsigned int number = 0;
69 10 const float* inputPtr = aVector;
70 10 float* outputPtr = cVector;
71
2/2
✓ Branch 0 taken 262182 times.
✓ Branch 1 taken 10 times.
/* One element per iteration; both cursors advance in lock-step. */
262192 for (number = 0; number < num_points; number++) {
72 262182 *outputPtr = (*inputPtr) + scalar;
73 262182 inputPtr++;
74 262182 outputPtr++;
75 }
76 10 }
77
78 #endif /* LV_HAVE_GENERIC */
79 #ifdef LV_HAVE_SSE
80 #include <xmmintrin.h>
81
/*
 * SSE kernel built on unaligned loads and stores (_mm_loadu_ps /
 * _mm_storeu_ps): adds scalar to each element of aVector and writes the
 * sums to cVector.
 *
 * cVector    - output buffer.
 * aVector    - input buffer.
 * scalar     - value added to every input element.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 4 groups of four floats; the
 * remaining num_points % 4 elements are handed to
 * volk_32f_s32f_add_32f_generic.
 */
82 2 static inline void volk_32f_s32f_add_32f_u_sse(float* cVector,
83 const float* aVector,
84 const float scalar,
85 unsigned int num_points)
86 {
87 2 unsigned int number = 0;
88 2 const unsigned int quarterPoints = num_points / 4;
89
90 2 float* cPtr = cVector;
91 2 const float* aPtr = aVector;
92
93 __m128 aVal, bVal, cVal;
/* Replicate the scalar into all four lanes once, outside the loop. */
94 2 bVal = _mm_set_ps1(scalar);
95
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
96 65534 aVal = _mm_loadu_ps(aPtr);
97
98 65534 cVal = _mm_add_ps(aVal, bVal);
99
100 _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
101
102 65534 aPtr += 4;
103 65534 cPtr += 4;
104 }
105
/*
 * number becomes the count of elements already written by the vector loop;
 * cPtr and aPtr already point at the first unprocessed element.
 */
106 2 number = quarterPoints * 4;
107 2 volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
108 2 }
109 #endif /* LV_HAVE_SSE */
110
111 #ifdef LV_HAVE_AVX
112 #include <immintrin.h>
113
/*
 * AVX kernel built on unaligned loads and stores (_mm256_loadu_ps /
 * _mm256_storeu_ps): adds scalar to each element of aVector and writes the
 * sums to cVector.
 *
 * cVector    - output buffer.
 * aVector    - input buffer.
 * scalar     - value added to every input element.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 8 groups of eight floats; the
 * remaining num_points % 8 elements are handed to
 * volk_32f_s32f_add_32f_generic.
 */
114 2 static inline void volk_32f_s32f_add_32f_u_avx(float* cVector,
115 const float* aVector,
116 const float scalar,
117 unsigned int num_points)
118 {
119 2 unsigned int number = 0;
120 2 const unsigned int eighthPoints = num_points / 8;
121
122 2 float* cPtr = cVector;
123 2 const float* aPtr = aVector;
124
125 __m256 aVal, bVal, cVal;
/* Replicate the scalar into all eight lanes once, outside the loop. */
126 2 bVal = _mm256_set1_ps(scalar);
127
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
128
129 32766 aVal = _mm256_loadu_ps(aPtr);
130
131 32766 cVal = _mm256_add_ps(aVal, bVal);
132
133 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
134
135 32766 aPtr += 8;
136 32766 cPtr += 8;
137 }
138
/*
 * number becomes the count of elements already written by the vector loop;
 * cPtr and aPtr already point at the first unprocessed element.
 */
139 2 number = eighthPoints * 8;
140 2 volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
141 2 }
142 #endif /* LV_HAVE_AVX */
143
144 #ifdef LV_HAVE_NEON
145 #include <arm_neon.h>
146
/*
 * NEON kernel: adds scalar to each element of aVector and writes the sums
 * to cVector.
 *
 * cVector    - output buffer.
 * aVector    - input buffer.
 * scalar     - value added to every input element.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 4 groups of four floats; the
 * remaining num_points % 4 elements are handed to
 * volk_32f_s32f_add_32f_generic.
 */
147 static inline void volk_32f_s32f_add_32f_u_neon(float* cVector,
148 const float* aVector,
149 const float scalar,
150 unsigned int num_points)
151 {
152 unsigned int number = 0;
153 const float* inputPtr = aVector;
154 float* outputPtr = cVector;
155 const unsigned int quarterPoints = num_points / 4;
156
157 float32x4_t aVal, cVal, scalarvec;
158
/* Replicate the scalar into all four lanes once, outside the loop. */
159 scalarvec = vdupq_n_f32(scalar);
160
161 for (number = 0; number < quarterPoints; number++) {
162 aVal = vld1q_f32(inputPtr); // Load into NEON regs
163 cVal = vaddq_f32(aVal, scalarvec); // Do the add
164 vst1q_f32(outputPtr, cVal); // Store results back to output
165 inputPtr += 4;
166 outputPtr += 4;
167 }
168
/*
 * number becomes the count of elements already written by the vector loop;
 * outputPtr and inputPtr already point at the first unprocessed element.
 */
169 number = quarterPoints * 4;
170 volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points - number);
171 }
172 #endif /* LV_HAVE_NEON */
173
174
175 #endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */
176
177
178 #ifndef INCLUDED_volk_32f_s32f_add_32f_a_H
179 #define INCLUDED_volk_32f_s32f_add_32f_a_H
180
181 #ifdef LV_HAVE_SSE
182 #include <xmmintrin.h>
183
/*
 * SSE kernel built on aligned loads and stores (_mm_load_ps / _mm_store_ps):
 * adds scalar to each element of aVector and writes the sums to cVector.
 *
 * cVector    - output buffer; accessed with _mm_store_ps, which requires a
 *              16-byte-aligned address.
 * aVector    - input buffer; accessed with _mm_load_ps, which requires a
 *              16-byte-aligned address.
 * scalar     - value added to every input element.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 4 groups of four floats; the
 * remaining num_points % 4 elements are handed to
 * volk_32f_s32f_add_32f_generic.
 */
184 2 static inline void volk_32f_s32f_add_32f_a_sse(float* cVector,
185 const float* aVector,
186 const float scalar,
187 unsigned int num_points)
188 {
189 2 unsigned int number = 0;
190 2 const unsigned int quarterPoints = num_points / 4;
191
192 2 float* cPtr = cVector;
193 2 const float* aPtr = aVector;
194
195 __m128 aVal, bVal, cVal;
/* Replicate the scalar into all four lanes once, outside the loop. */
196 2 bVal = _mm_set_ps1(scalar);
197
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
198 65534 aVal = _mm_load_ps(aPtr);
199
200 65534 cVal = _mm_add_ps(aVal, bVal);
201
202 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
203
204 65534 aPtr += 4;
205 65534 cPtr += 4;
206 }
207
/*
 * number becomes the count of elements already written by the vector loop;
 * cPtr and aPtr already point at the first unprocessed element.
 */
208 2 number = quarterPoints * 4;
209 2 volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
210 2 }
211 #endif /* LV_HAVE_SSE */
212
213 #ifdef LV_HAVE_AVX
214 #include <immintrin.h>
215
/*
 * AVX kernel built on aligned loads and stores (_mm256_load_ps /
 * _mm256_store_ps): adds scalar to each element of aVector and writes the
 * sums to cVector.
 *
 * cVector    - output buffer; accessed with _mm256_store_ps, which requires
 *              a 32-byte-aligned address.
 * aVector    - input buffer; accessed with _mm256_load_ps, which requires a
 *              32-byte-aligned address.
 * scalar     - value added to every input element.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 8 groups of eight floats; the
 * remaining num_points % 8 elements are handed to
 * volk_32f_s32f_add_32f_generic.
 */
216 2 static inline void volk_32f_s32f_add_32f_a_avx(float* cVector,
217 const float* aVector,
218 const float scalar,
219 unsigned int num_points)
220 {
221 2 unsigned int number = 0;
222 2 const unsigned int eighthPoints = num_points / 8;
223
224 2 float* cPtr = cVector;
225 2 const float* aPtr = aVector;
226
227 __m256 aVal, bVal, cVal;
/* Replicate the scalar into all eight lanes once, outside the loop. */
228 2 bVal = _mm256_set1_ps(scalar);
229
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
230 32766 aVal = _mm256_load_ps(aPtr);
231
232 32766 cVal = _mm256_add_ps(aVal, bVal);
233
234 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
235
236 32766 aPtr += 8;
237 32766 cPtr += 8;
238 }
239
/*
 * number becomes the count of elements already written by the vector loop;
 * cPtr and aPtr already point at the first unprocessed element.
 */
240 2 number = eighthPoints * 8;
241 2 volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
242 2 }
243 #endif /* LV_HAVE_AVX */
244
245 #ifdef LV_HAVE_ORC
246
/*
 * Implementation with external linkage; its definition is not part of this
 * file.
 */
247 extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst,
248 const float* src,
249 const float scalar,
250 unsigned int num_points);
251
/*
 * ORC entry point: forwards all four arguments unchanged to
 * volk_32f_s32f_add_32f_a_orc_impl and does no work of its own.
 *
 * cVector    - output buffer, passed through as dst.
 * aVector    - input buffer, passed through as src.
 * scalar     - passed through unchanged.
 * num_points - passed through unchanged.
 *
 * NOTE(review): this wrapper is named _u_ while the implementation it calls
 * is named _a_; whether the implementation needs aligned buffers cannot be
 * determined from this file - confirm against the ORC source.
 */
252 2 static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
253 const float* aVector,
254 const float scalar,
255 unsigned int num_points)
256 {
257 2 volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points);
258 2 }
259 #endif /* LV_HAVE_ORC */
260
261 #endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */
262