GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32f_64f_add_64f.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 56 56 100.0%
Functions: 3 3 100.0%
Branches: 10 10 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32f_64f_add_64f
12 *
13 * \b Overview
14 *
15 * Adds two input vectors and stores the result as a double-precision vector.
16 * One of the input vectors is defined as single-precision floating point, so
17 * upcasting is performed before the addition.
18 *
19 * c[i] = a[i] + b[i]
20 *
21 * <b>Dispatcher Prototype</b>
22 * \code
23 * void volk_32f_64f_add_64f(double* cVector, const float* aVector, const
24 * double* bVector, unsigned int num_points) \endcode
25 *
26 * \b Inputs
27 * \li aVector: First input vector.
28 * \li bVector: Second input vector.
29 * \li num_points: The number of values in both input vectors.
30 *
31 * \b Outputs
32 * \li cVector: The output vector.
33 *
34 * \b Example
35 * Add elements of an increasing vector to those of a decreasing vector.
36 * \code
37 * int N = 10;
38 * unsigned int alignment = volk_get_alignment();
39 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
40 * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
41 * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
42 *
43 * for(unsigned int ii = 0; ii < N; ++ii){
44 * increasing[ii] = (float)ii;
45 * decreasing[ii] = 10.0 - (double)ii;
46 * }
47 *
48 * volk_32f_64f_add_64f(out, increasing, decreasing, N);
49 *
50 * for(unsigned int ii = 0; ii < N; ++ii){
51 * printf("out[%u] = %1.2F\n", ii, out[ii]);
52 * }
53 *
54 * volk_free(increasing);
55 * volk_free(decreasing);
56 * volk_free(out);
57 * \endcode
58 */
59
60 #ifndef INCLUDED_volk_32f_64f_add_64f_H
61 #define INCLUDED_volk_32f_64f_add_64f_H
62
63 #include <inttypes.h>
64
65 #ifdef LV_HAVE_GENERIC
66
67 2 static inline void volk_32f_64f_add_64f_generic(double* cVector,
68 const float* aVector,
69 const double* bVector,
70 unsigned int num_points)
71 {
72 2 double* cPtr = cVector;
73 2 const float* aPtr = aVector;
74 2 const double* bPtr = bVector;
75 2 unsigned int number = 0;
76
77
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
78 262142 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
79 }
80 2 }
81
82 #endif /* LV_HAVE_GENERIC */
83
84 #ifdef LV_HAVE_NEONV8
85 #include <arm_neon.h>
86
87 static inline void volk_32f_64f_add_64f_neon(double* cVector,
88 const float* aVector,
89 const double* bVector,
90 unsigned int num_points)
91 {
92 unsigned int number = 0;
93 const unsigned int half_points = num_points / 2;
94
95 double* cPtr = cVector;
96 const float* aPtr = aVector;
97 const double* bPtr = bVector;
98
99 float64x2_t aVal, bVal, cVal;
100 float32x2_t aVal1;
101 for (number = 0; number < half_points; number++) {
102 // Load in to NEON registers
103 aVal1 = vld1_f32(aPtr);
104 bVal = vld1q_f64(bPtr);
105 __VOLK_PREFETCH(aPtr + 2);
106 __VOLK_PREFETCH(bPtr + 2);
107 aPtr += 2; // q uses quadwords, 4 floats per vadd
108 bPtr += 2;
109
110 // Vector conversion
111 aVal = vcvt_f64_f32(aVal1);
112 // vector add
113 cVal = vaddq_f64(aVal, bVal);
114 // Store the results back into the C container
115 vst1q_f64(cPtr, cVal);
116
117 cPtr += 2;
118 }
119
120 number = half_points * 2; // should be = num_points
121 for (; number < num_points; number++) {
122 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
123 }
124 }
125
126 #endif /* LV_HAVE_NEONV8 */
127
128 #ifdef LV_HAVE_AVX
129
130 #include <immintrin.h>
131 #include <xmmintrin.h>
132
133 2 static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
134 const float* aVector,
135 const double* bVector,
136 unsigned int num_points)
137 {
138 2 unsigned int number = 0;
139 2 const unsigned int eighth_points = num_points / 8;
140
141 2 double* cPtr = cVector;
142 2 const float* aPtr = aVector;
143 2 const double* bPtr = bVector;
144
145 __m256 aVal;
146 __m128 aVal1, aVal2;
147 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
148
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighth_points; number++) {
149
150 32766 aVal = _mm256_loadu_ps(aPtr);
151 32766 bVal1 = _mm256_loadu_pd(bPtr);
152 32766 bVal2 = _mm256_loadu_pd(bPtr + 4);
153
154 32766 aVal1 = _mm256_extractf128_ps(aVal, 0);
155 32766 aVal2 = _mm256_extractf128_ps(aVal, 1);
156
157 32766 aDbl1 = _mm256_cvtps_pd(aVal1);
158 32766 aDbl2 = _mm256_cvtps_pd(aVal2);
159
160 32766 cVal1 = _mm256_add_pd(aDbl1, bVal1);
161 32766 cVal2 = _mm256_add_pd(aDbl2, bVal2);
162
163 _mm256_storeu_pd(cPtr,
164 cVal1); // Store the results back into the C container
165 32766 _mm256_storeu_pd(cPtr + 4,
166 cVal2); // Store the results back into the C container
167
168 32766 aPtr += 8;
169 32766 bPtr += 8;
170 32766 cPtr += 8;
171 }
172
173 2 number = eighth_points * 8;
174
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
175 14 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
176 }
177 2 }
178
179 #endif /* LV_HAVE_AVX */
180
181 #ifdef LV_HAVE_AVX
182
183 #include <immintrin.h>
184 #include <xmmintrin.h>
185
186 2 static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
187 const float* aVector,
188 const double* bVector,
189 unsigned int num_points)
190 {
191 2 unsigned int number = 0;
192 2 const unsigned int eighth_points = num_points / 8;
193
194 2 double* cPtr = cVector;
195 2 const float* aPtr = aVector;
196 2 const double* bPtr = bVector;
197
198 __m256 aVal;
199 __m128 aVal1, aVal2;
200 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
201
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighth_points; number++) {
202
203 32766 aVal = _mm256_load_ps(aPtr);
204 32766 bVal1 = _mm256_load_pd(bPtr);
205 32766 bVal2 = _mm256_load_pd(bPtr + 4);
206
207 32766 aVal1 = _mm256_extractf128_ps(aVal, 0);
208 32766 aVal2 = _mm256_extractf128_ps(aVal, 1);
209
210 32766 aDbl1 = _mm256_cvtps_pd(aVal1);
211 32766 aDbl2 = _mm256_cvtps_pd(aVal2);
212
213 32766 cVal1 = _mm256_add_pd(aDbl1, bVal1);
214 32766 cVal2 = _mm256_add_pd(aDbl2, bVal2);
215
216 _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
217 32766 _mm256_store_pd(cPtr + 4,
218 cVal2); // Store the results back into the C container
219
220 32766 aPtr += 8;
221 32766 bPtr += 8;
222 32766 cPtr += 8;
223 }
224
225 2 number = eighth_points * 8;
226
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
227 14 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
228 }
229 2 }
230
231 #endif /* LV_HAVE_AVX */
232
233 #endif /* INCLUDED_volk_32f_64f_add_64f_H */
234