GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32fc_32f_multiply_32fc.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 61 61 100.0%
Functions: 4 4 100.0%
Branches: 10 10 100.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32fc_32f_multiply_32fc
12 *
13 * \b Overview
14 *
15 * Multiplies each point of a complex vector by the corresponding point of a
16 * real floating point vector and stores the complex result in cVector.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
21 * float* bVector, unsigned int num_points); \endcode
22 *
23 * \b Inputs
24 * \li aVector: The input vector of complex floats.
25 * \li bVector: The input vector of floats.
26 * \li num_points: The number of data points.
27 *
28 * \b Outputs
29 * \li cVector: The output vector of complex floats.
30 *
31 * \b Example
32 * \code
33 * int N = 10000;
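34 * // out, in: lv_32fc_t buffers; taps: float buffer; N points each (volk_malloc)
35 * volk_32fc_32f_multiply_32fc(out, in, taps, N);
36 * volk_free(out);
37 * volk_free(in);
38 * volk_free(taps);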
39 * \endcode
40 */
41
42 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
43 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
44
45 #include <inttypes.h>
46 #include <stdio.h>
47
48 #ifdef LV_HAVE_AVX
49 #include <immintrin.h>
50
51 2 static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
52 const lv_32fc_t* aVector,
53 const float* bVector,
54 unsigned int num_points)
55 {
56 2 unsigned int number = 0;
57 2 const unsigned int eighthPoints = num_points / 8;
58
59 2 lv_32fc_t* cPtr = cVector;
60 2 const lv_32fc_t* aPtr = aVector;
61 2 const float* bPtr = bVector;
62
63 __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
64
65 2 __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
66
67
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < eighthPoints; number++) {
68
69 32766 aVal1 = _mm256_load_ps((float*)aPtr);
70 32766 aPtr += 4;
71
72 32766 aVal2 = _mm256_load_ps((float*)aPtr);
73 32766 aPtr += 4;
74
75 32766 bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
76 32766 bPtr += 8;
77
78 32766 bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
79 32766 bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
80
81 32766 bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
82 32766 bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
83
84 32766 cVal1 = _mm256_mul_ps(aVal1, bVal1);
85 32766 cVal2 = _mm256_mul_ps(aVal2, bVal2);
86
87 _mm256_store_ps((float*)cPtr,
88 cVal1); // Store the results back into the C container
89 32766 cPtr += 4;
90
91 _mm256_store_ps((float*)cPtr,
92 cVal2); // Store the results back into the C container
93 32766 cPtr += 4;
94 }
95
96 2 number = eighthPoints * 8;
97
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; ++number) {
98 14 *cPtr++ = (*aPtr++) * (*bPtr++);
99 }
100 2 }
101 #endif /* LV_HAVE_AVX */
102
103
104 #ifdef LV_HAVE_SSE
105 #include <xmmintrin.h>
106
107 2 static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
108 const lv_32fc_t* aVector,
109 const float* bVector,
110 unsigned int num_points)
111 {
112 2 unsigned int number = 0;
113 2 const unsigned int quarterPoints = num_points / 4;
114
115 2 lv_32fc_t* cPtr = cVector;
116 2 const lv_32fc_t* aPtr = aVector;
117 2 const float* bPtr = bVector;
118
119 __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
120
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
121
122 65534 aVal1 = _mm_load_ps((const float*)aPtr);
123 65534 aPtr += 2;
124
125 65534 aVal2 = _mm_load_ps((const float*)aPtr);
126 65534 aPtr += 2;
127
128 65534 bVal = _mm_load_ps(bPtr);
129 65534 bPtr += 4;
130
131 65534 bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
132 65534 bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
133
134 65534 cVal = _mm_mul_ps(aVal1, bVal1);
135
136 _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
137 65534 cPtr += 2;
138
139 65534 cVal = _mm_mul_ps(aVal2, bVal2);
140
141 _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
142
143 65534 cPtr += 2;
144 }
145
146 2 number = quarterPoints * 4;
147
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
148 6 *cPtr++ = (*aPtr++) * (*bPtr);
149 6 bPtr++;
150 }
151 2 }
152 #endif /* LV_HAVE_SSE */
153
154
155 #ifdef LV_HAVE_GENERIC
156
157 2 static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
158 const lv_32fc_t* aVector,
159 const float* bVector,
160 unsigned int num_points)
161 {
162 2 lv_32fc_t* cPtr = cVector;
163 2 const lv_32fc_t* aPtr = aVector;
164 2 const float* bPtr = bVector;
165 2 unsigned int number = 0;
166
167
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
168 262142 *cPtr++ = (*aPtr++) * (*bPtr++);
169 }
170 2 }
171 #endif /* LV_HAVE_GENERIC */
172
173
174 #ifdef LV_HAVE_NEON
175 #include <arm_neon.h>
176
177 static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
178 const lv_32fc_t* aVector,
179 const float* bVector,
180 unsigned int num_points)
181 {
182 lv_32fc_t* cPtr = cVector;
183 const lv_32fc_t* aPtr = aVector;
184 const float* bPtr = bVector;
185 unsigned int number = 0;
186 unsigned int quarter_points = num_points / 4;
187
188 float32x4x2_t inputVector, outputVector;
189 float32x4_t tapsVector;
190 for (number = 0; number < quarter_points; number++) {
191 inputVector = vld2q_f32((float*)aPtr);
192 tapsVector = vld1q_f32(bPtr);
193
194 outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
195 outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
196
197 vst2q_f32((float*)cPtr, outputVector);
198 aPtr += 4;
199 bPtr += 4;
200 cPtr += 4;
201 }
202
203 for (number = quarter_points * 4; number < num_points; number++) {
204 *cPtr++ = (*aPtr++) * (*bPtr++);
205 }
206 }
207 #endif /* LV_HAVE_NEON */
208
209
210 #ifdef LV_HAVE_ORC
211
212 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
213 const lv_32fc_t* aVector,
214 const float* bVector,
215 unsigned int num_points);
216
217 2 static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
218 const lv_32fc_t* aVector,
219 const float* bVector,
220 unsigned int num_points)
221 {
222 2 volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
223 2 }
224
225 #endif /* LV_HAVE_ORC */
226
227
228 #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
229