GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_32f_add_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 58 58 100.0%
Functions: 3 3 100.0%
Branches: 10 10 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_32f_add_32fc
12 *
13 * \b Overview
14 *
15 * Adds a complex vector and a real (float) vector together element by element:
16 *
17 * c[i] = a[i] + b[i]
18 *
19 * <b>Dispatcher Prototype</b>
20 * \code
21 * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float*
22 * bVector, unsigned int num_points) \endcode
23 *
24 * \b Inputs
25 * \li aVector: First vector of input points (complex).
26 * \li bVector: Second vector of input points (real float).
27 * \li num_points: The number of values in each input vector.
28 *
29 * \b Outputs
30 * \li cVector: The output vector.
31 *
32 * \b Example
33 *
34 * The following example adds the increasing and decreasing vectors such that the
35 * result of every summation pair is 10.
36 *
37 * \code
38 * int N = 10;
39 * unsigned int alignment = volk_get_alignment();
40 * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
41 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
42 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
43 *
44 * for(unsigned int ii = 0; ii < N; ++ii){
45 * increasing[ii] = (lv_32fc_t)ii;
46 * decreasing[ii] = 10.f - (float)ii;
47 * }
48 *
49 * volk_32fc_32f_add_32fc(out, increasing, decreasing, N);
50 *
51 * for(unsigned int ii = 0; ii < N; ++ii){
52 * printf("out[%u] = %1.2f + %1.2fj\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
53 * }
54 *
55 * volk_free(increasing);
56 * volk_free(decreasing);
57 * volk_free(out);
58 * \endcode
59 */
60
61 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
62 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
63
64 #ifdef LV_HAVE_GENERIC
65
66 2 static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, // out: receives a[i] + b[i]
67 const lv_32fc_t* aVector, // in: first operand vector
68 const float* bVector, // in: second operand vector (plain float)
69 unsigned int num_points) // in: number of elements to process
70 { // Scalar implementation: one addition per loop iteration, no SIMD intrinsics.
71 2 lv_32fc_t* cPtr = cVector;
72 2 const lv_32fc_t* aPtr = aVector;
73 2 const float* bPtr = bVector;
74 2 unsigned int number = 0;
75
76
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
77 262142 *cPtr++ = (*aPtr++) + (*bPtr++); // add one pair, then advance all three cursors
78 }
79 2 }
80 #endif /* LV_HAVE_GENERIC */
81
82
83 #ifdef LV_HAVE_AVX
84 #include <immintrin.h>
85
86 2 static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, // out: receives a[i] + b[i]
87 const lv_32fc_t* aVector, // in: first operand vector
88 const float* bVector, // in: second operand vector (plain float)
89 unsigned int num_points) // in: number of elements to process
90 { // AVX version using unaligned loads/stores: 8 elements per SIMD iteration, scalar loop for the rest.
91 2 unsigned int number = 0;
92 2 const unsigned int eighthPoints = num_points / 8; // count of full 8-element SIMD iterations
93
94 2 lv_32fc_t* cPtr = cVector;
95 2 const lv_32fc_t* aPtr = aVector;
96 2 const float* bPtr = bVector;
97
98 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
99 __m256 cpx_b1, cpx_b2;
100 __m256 zero;
101 2 zero = _mm256_setzero_ps(); // filler interleaved with b so nothing is added to the odd float slots
102 __m256 tmp1, tmp2;
103
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
104
105 32766 aVal1 = _mm256_loadu_ps((float*)aPtr); // 8 floats; the code treats this as a[0..3], two floats per element
106 65532 aVal2 = _mm256_loadu_ps((float*)(aPtr + 4)); // next 8 floats: a[4..7]
107 32766 bVal = _mm256_loadu_ps(bPtr); // b0 .. b7
108 32766 cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
109 32766 cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
110
111 32766 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); // low lanes of both: b0, 0, b1, 0, b2, 0, b3, 0
112 32766 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); // high lanes of both: b4, 0, b5, 0, b6, 0, b7, 0
113
114 32766 cVal1 = _mm256_add_ps(aVal1, tmp1); // a[0..3] + (b0..b3 in the even slots)
115 32766 cVal2 = _mm256_add_ps(aVal2, tmp2); // a[4..7] + (b4..b7 in the even slots)
116
117 _mm256_storeu_ps((float*)cPtr,
118 cVal1); // Store results 0..3 to the output (unaligned store)
119 32766 _mm256_storeu_ps((float*)(cPtr + 4),
120 cVal2); // Store results 4..7 to the output (unaligned store)
121
122 32766 aPtr += 8;
123 32766 bPtr += 8;
124 32766 cPtr += 8;
125 }
126
127 2 number = eighthPoints * 8; // index of the first element not covered by the SIMD loop
128
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
129 14 *cPtr++ = (*aPtr++) + (*bPtr++); // scalar tail: the remaining num_points % 8 elements
130 }
131 2 }
132 #endif /* LV_HAVE_AVX */
133
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136
137 2 static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, // out: receives a[i] + b[i]
138 const lv_32fc_t* aVector, // in: first operand vector
139 const float* bVector, // in: second operand vector (plain float)
140 unsigned int num_points) // in: number of elements to process
141 { // AVX version using aligned loads/stores (_mm256_load_ps / _mm256_store_ps expect 32-byte aligned addresses).
142 2 unsigned int number = 0;
143 2 const unsigned int eighthPoints = num_points / 8; // count of full 8-element SIMD iterations
144
145 2 lv_32fc_t* cPtr = cVector;
146 2 const lv_32fc_t* aPtr = aVector;
147 2 const float* bPtr = bVector;
148
149 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
150 __m256 cpx_b1, cpx_b2;
151 __m256 zero;
152 2 zero = _mm256_setzero_ps(); // filler interleaved with b so nothing is added to the odd float slots
153 __m256 tmp1, tmp2;
154
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
155
156 32766 aVal1 = _mm256_load_ps((float*)aPtr); // 8 floats; the code treats this as a[0..3], two floats per element
157 65532 aVal2 = _mm256_load_ps((float*)(aPtr + 4)); // next 8 floats: a[4..7]
158 32766 bVal = _mm256_load_ps(bPtr); // b0 .. b7
159 32766 cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
160 32766 cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
161
162 32766 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); // low lanes of both: b0, 0, b1, 0, b2, 0, b3, 0
163 32766 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); // high lanes of both: b4, 0, b5, 0, b6, 0, b7, 0
164
165 32766 cVal1 = _mm256_add_ps(aVal1, tmp1); // a[0..3] + (b0..b3 in the even slots)
166 32766 cVal2 = _mm256_add_ps(aVal2, tmp2); // a[4..7] + (b4..b7 in the even slots)
167
168 _mm256_store_ps((float*)cPtr,
169 cVal1); // Store results 0..3 to the output (aligned store)
170 32766 _mm256_store_ps((float*)(cPtr + 4),
171 cVal2); // Store results 4..7 to the output (aligned store)
172
173 32766 aPtr += 8;
174 32766 bPtr += 8;
175 32766 cPtr += 8;
176 }
177
178 2 number = eighthPoints * 8; // index of the first element not covered by the SIMD loop
179
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
180 14 *cPtr++ = (*aPtr++) + (*bPtr++); // scalar tail: the remaining num_points % 8 elements
181 }
182 2 }
183 #endif /* LV_HAVE_AVX */
184
185 #ifdef LV_HAVE_NEON
186 #include <arm_neon.h>
187
188 static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, // out: receives a[i] + b[i]
189 const lv_32fc_t* aVector, // in: first operand vector
190 const float* bVector, // in: second operand vector (plain float)
191 unsigned int num_points) // in: number of elements to process
192 { // NEON version: 16 elements per SIMD iteration, scalar loop for the remainder.
193 lv_32fc_t* cPtr = cVector;
194 const lv_32fc_t* aPtr = aVector;
195 const float* bPtr = bVector;
196
197 float32x4x4_t aVal0, aVal1;
198 float32x4x2_t bVal0, bVal1;
199
200 const unsigned int sixteenthPoints = num_points / 16; // count of full 16-element SIMD iterations
201 unsigned int number = 0;
202 for (; number < sixteenthPoints; number++) {
203 aVal0 = vld4q_f32((const float*)aPtr); // 16 floats de-interleaved 4 ways: val[0]/val[2] hold float slots 0 and 2 of each group of four
204 aPtr += 8; // the 16 floats just read are stepped over as 8 lv_32fc_t elements
205 aVal1 = vld4q_f32((const float*)aPtr); // same layout for the next 8 elements
206 aPtr += 8;
207 __VOLK_PREFETCH(aPtr + 16); // macro defined elsewhere; presumably a prefetch hint for upcoming input - confirm
208
209 bVal0 = vld2q_f32((const float*)bPtr); // 8 floats split into even-index (val[0]) and odd-index (val[1]) values
210 bPtr += 8;
211 bVal1 = vld2q_f32((const float*)bPtr); // next 8 values of b, same even/odd split
212 bPtr += 8;
213 __VOLK_PREFETCH(bPtr + 16); // macro defined elsewhere; presumably a prefetch hint for upcoming input - confirm
214
215 aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); // first float of even-indexed elements += even-indexed b
216 aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); // first float of odd-indexed elements += odd-indexed b
217
218 aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); // same for the second group; val[1]/val[3] are never modified
219 aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
220
221 vst4q_f32((float*)(cPtr), aVal0); // re-interleave and store the first 8 results
222 cPtr += 8;
223 vst4q_f32((float*)(cPtr), aVal1); // re-interleave and store the next 8 results
224 cPtr += 8;
225 }
226
227 for (number = sixteenthPoints * 16; number < num_points; number++) {
228 *cPtr++ = (*aPtr++) + (*bPtr++); // scalar tail: the remaining num_points % 16 elements
229 }
230 }
231 #endif /* LV_HAVE_NEON */
232
233
234 #endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
235