GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_32i_x2_and_32i.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 62 96 64.6%
Functions: 5 7 71.4%
Branches: 14 22 63.6%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_32i_x2_and_32i
12 *
13 * \b Overview
14 *
15 * Computes the element-wise bitwise AND of two input 32-bit integer vectors.
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
20 * bVector, unsigned int num_points) \endcode
21 *
22 * \b Inputs
23 * \li aVector: Input vector of samples.
24 * \li bVector: Input vector of samples.
25 * \li num_points: The number of values.
26 *
27 * \b Outputs
28 * \li cVector: The output vector.
29 *
30 * \b Example
31 * This example generates a Karnaugh map for the lower two bits of x AND y.
32 * \code
33 * int N = 1<<4;
34 * unsigned int alignment = volk_get_alignment();
35 *
36 * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
37 * int32_t* y = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
38 * int32_t* z = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
39 * int32_t in_seq[] = {0,1,3,2};
40 * unsigned int jj=0;
41 * for(unsigned int ii=0; ii<N; ++ii){
42 * x[ii] = in_seq[ii%4];
43 * y[ii] = in_seq[jj];
44 * if(((ii+1) % 4) == 0) jj++;
45 * }
46 *
47 * volk_32i_x2_and_32i(z, x, y, N);
48 *
49 * printf("Karnaugh map for x AND y\n");
50 * printf("y\\x|");
51 * for(unsigned int ii=0; ii<4; ++ii){
52 * printf(" %.2x ", in_seq[ii]);
53 * }
54 * printf("\n---|---------------\n");
55 * jj = 0;
56 * for(unsigned int ii=0; ii<N; ++ii){
57 * if(((ii+1) % 4) == 1){
58 * printf("%.2x | ", in_seq[jj++]);
59 * }
60 * printf("%.2x ", z[ii]);
61 * if(!((ii+1) % 4)){
62 * printf("\n");
63 * }
64 * }
65 * \endcode
66 */
67
68 #ifndef INCLUDED_volk_32i_x2_and_32i_a_H
69 #define INCLUDED_volk_32i_x2_and_32i_a_H
70
71 #include <inttypes.h>
72 #include <stdio.h>
73
74 #ifdef LV_HAVE_AVX512F
75 #include <immintrin.h>
76
/*!
 * Aligned AVX-512F kernel: writes cVector[i] = aVector[i] & bVector[i] for
 * every i in [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * The main loop handles 16 elements (one 512-bit register) per iteration
 * using _mm512_load_si512 / _mm512_store_si512, which are the aligned-access
 * forms of these intrinsics, so all three buffers must be 64-byte aligned.
 * The remaining num_points % 16 elements are handled by a scalar loop.
 */
77 static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
78 const int32_t* aVector,
79 const int32_t* bVector,
80 unsigned int num_points)
81 {
82 unsigned int number = 0;
83 const unsigned int sixteenthPoints = num_points / 16;
84
85 int32_t* cPtr = (int32_t*)cVector;
86 const int32_t* aPtr = (int32_t*)aVector;
87 const int32_t* bPtr = (int32_t*)bVector;
88
89 __m512i aVal, bVal, cVal;
90 for (; number < sixteenthPoints; number++) {
91
92 aVal = _mm512_load_si512(aPtr);
93 bVal = _mm512_load_si512(bPtr);
94
95 cVal = _mm512_and_si512(aVal, bVal);
96
97 _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
98
99 aPtr += 16;
100 bPtr += 16;
101 cPtr += 16;
102 }
103
/* Scalar tail: restart at the first element the vector loop did not cover and
 * index the original (un-advanced) vectors directly. */
104 number = sixteenthPoints * 16;
105 for (; number < num_points; number++) {
106 cVector[number] = aVector[number] & bVector[number];
107 }
108 }
109 #endif /* LV_HAVE_AVX512F */
110
111 #ifdef LV_HAVE_AVX2
112 #include <immintrin.h>
113
/*!
 * Aligned AVX2 kernel: writes cVector[i] = aVector[i] & bVector[i] for every
 * i in [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * The main loop handles 8 elements (one 256-bit register) per iteration using
 * _mm256_load_si256 / _mm256_store_si256, the aligned-access forms, so all
 * three buffers must be 32-byte aligned. The remaining num_points % 8
 * elements are handled by a scalar loop.
 */
114 2 static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
115 const int32_t* aVector,
116 const int32_t* bVector,
117 unsigned int num_points)
118 {
119 2 unsigned int number = 0;
120 2 const unsigned int oneEightPoints = num_points / 8;
121
122 2 int32_t* cPtr = cVector;
123 2 const int32_t* aPtr = aVector;
124 2 const int32_t* bPtr = bVector;
125
126 __m256i aVal, bVal, cVal;
127
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < oneEightPoints; number++) {
128
/* The (__m256i*) casts only reinterpret the int32_t pointers for the
 * intrinsic's parameter type; the data itself is not converted. */
129 32766 aVal = _mm256_load_si256((__m256i*)aPtr);
130 32766 bVal = _mm256_load_si256((__m256i*)bPtr);
131
132 32766 cVal = _mm256_and_si256(aVal, bVal);
133
134 _mm256_store_si256((__m256i*)cPtr,
135 cVal); // Store the results back into the C container
136
137 32766 aPtr += 8;
138 32766 bPtr += 8;
139 32766 cPtr += 8;
140 }
141
/* Scalar tail: restart at the first element the vector loop did not cover and
 * index the original (un-advanced) vectors directly. */
142 2 number = oneEightPoints * 8;
143
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
144 14 cVector[number] = aVector[number] & bVector[number];
145 }
146 2 }
147 #endif /* LV_HAVE_AVX2 */
148
149
150 #ifdef LV_HAVE_SSE
151 #include <xmmintrin.h>
152
/*!
 * Aligned SSE kernel: writes cVector[i] = aVector[i] & bVector[i] for every
 * i in [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * Only <xmmintrin.h> (SSE1) is included here, so the kernel goes through the
 * float-typed intrinsics: the int32_t buffers are viewed through float
 * pointers and combined with _mm_and_ps. That intrinsic is a bitwise AND of
 * the 128-bit registers, so the integer bit patterns pass through without any
 * numeric conversion. _mm_load_ps / _mm_store_ps are the aligned-access
 * forms, so all three buffers must be 16-byte aligned. The main loop handles
 * 4 elements per iteration; the remaining num_points % 4 elements are handled
 * by a scalar loop on the original int32_t vectors.
 */
153 2 static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
154 const int32_t* aVector,
155 const int32_t* bVector,
156 unsigned int num_points)
157 {
158 2 unsigned int number = 0;
159 2 const unsigned int quarterPoints = num_points / 4;
160
/* Pointer reinterpretation only: lets the int32_t data be passed to the
 * float load/store intrinsics below. */
161 2 float* cPtr = (float*)cVector;
162 2 const float* aPtr = (float*)aVector;
163 2 const float* bPtr = (float*)bVector;
164
165 __m128 aVal, bVal, cVal;
166
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
65536 for (; number < quarterPoints; number++) {
167
168 65534 aVal = _mm_load_ps(aPtr);
169 65534 bVal = _mm_load_ps(bPtr);
170
171 65534 cVal = _mm_and_ps(aVal, bVal);
172
173 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
174
175 65534 aPtr += 4;
176 65534 bPtr += 4;
177 65534 cPtr += 4;
178 }
179
/* Scalar tail: restart at the first element the vector loop did not cover and
 * index the original int32_t vectors directly. */
180 2 number = quarterPoints * 4;
181
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
8 for (; number < num_points; number++) {
182 6 cVector[number] = aVector[number] & bVector[number];
183 }
184 2 }
185 #endif /* LV_HAVE_SSE */
186
187
188 #ifdef LV_HAVE_NEON
189 #include <arm_neon.h>
190
/*!
 * NEON kernel: writes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * The main loop handles 4 elements per iteration (vld1q_s32 -> vandq_s32 ->
 * vst1q_s32). Unlike the x86 kernels in this file, the scalar tail does not
 * index the original vectors; it keeps using the pointers that the vector
 * loop has already advanced, so `number` in the tail loop is only a counter.
 */
191 static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
192 const int32_t* aVector,
193 const int32_t* bVector,
194 unsigned int num_points)
195 {
196 int32_t* cPtr = cVector;
197 const int32_t* aPtr = aVector;
198 const int32_t* bPtr = bVector;
199 unsigned int number = 0;
200 unsigned int quarter_points = num_points / 4;
201
202 int32x4_t a_val, b_val, c_val;
203
204 for (number = 0; number < quarter_points; number++) {
205 a_val = vld1q_s32(aPtr);
206 b_val = vld1q_s32(bPtr);
207 c_val = vandq_s32(a_val, b_val);
208 vst1q_s32(cPtr, c_val);
209 aPtr += 4;
210 bPtr += 4;
211 cPtr += 4;
212 }
213
/* Scalar tail for the remaining num_points % 4 elements; the pointers already
 * point just past the last vectorised element. */
214 for (number = quarter_points * 4; number < num_points; number++) {
215 *cPtr++ = (*aPtr++) & (*bPtr++);
216 }
217 }
218 #endif /* LV_HAVE_NEON */
219
220
221 #ifdef LV_HAVE_GENERIC
222
/*!
 * Portable scalar kernel: writes cVector[i] = aVector[i] & bVector[i] for
 * every i in [0, num_points), one element per iteration. It uses no SIMD
 * intrinsics.
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 */
223 2 static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
224 const int32_t* aVector,
225 const int32_t* bVector,
226 unsigned int num_points)
227 {
228 2 int32_t* cPtr = cVector;
229 2 const int32_t* aPtr = aVector;
230 2 const int32_t* bPtr = bVector;
231 2 unsigned int number = 0;
232
233
2/2
✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
262144 for (number = 0; number < num_points; number++) {
/* All three cursors advance together; `number` only counts iterations. */
234 262142 *cPtr++ = (*aPtr++) & (*bPtr++);
235 }
236 2 }
237 #endif /* LV_HAVE_GENERIC */
238
239
240 #ifdef LV_HAVE_ORC
/*!
 * ORC-backed kernel. The actual work is done by
 * volk_32i_x2_and_32i_a_orc_impl, which is only declared here (extern) and
 * defined outside this header; the wrapper forwards its four arguments
 * unchanged.
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * NOTE(review): the wrapper is named `_u_` while the implementation it calls
 * is named `_a_`. Whether the implementation has any alignment requirement
 * cannot be determined from this header -- confirm against the ORC source.
 */
241 extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
242 const int32_t* aVector,
243 const int32_t* bVector,
244 unsigned int num_points);
245
246 2 static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
247 const int32_t* aVector,
248 const int32_t* bVector,
249 unsigned int num_points)
250 {
251 2 volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
252 2 }
253 #endif /* LV_HAVE_ORC */
254
255
256 #endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
257
258
259 #ifndef INCLUDED_volk_32i_x2_and_32i_u_H
260 #define INCLUDED_volk_32i_x2_and_32i_u_H
261
262 #include <inttypes.h>
263 #include <stdio.h>
264
265 #ifdef LV_HAVE_AVX512F
266 #include <immintrin.h>
267
/*!
 * Unaligned AVX-512F kernel: writes cVector[i] = aVector[i] & bVector[i] for
 * every i in [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * Same structure as the aligned AVX-512F kernel, but it uses
 * _mm512_loadu_si512 / _mm512_storeu_si512, the unaligned-access forms, so
 * the buffers need no particular alignment. The main loop handles 16
 * elements per iteration; the remaining num_points % 16 elements are handled
 * by a scalar loop.
 */
268 static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
269 const int32_t* aVector,
270 const int32_t* bVector,
271 unsigned int num_points)
272 {
273 unsigned int number = 0;
274 const unsigned int sixteenthPoints = num_points / 16;
275
276 int32_t* cPtr = (int32_t*)cVector;
277 const int32_t* aPtr = (int32_t*)aVector;
278 const int32_t* bPtr = (int32_t*)bVector;
279
280 __m512i aVal, bVal, cVal;
281 for (; number < sixteenthPoints; number++) {
282
283 aVal = _mm512_loadu_si512(aPtr);
284 bVal = _mm512_loadu_si512(bPtr);
285
286 cVal = _mm512_and_si512(aVal, bVal);
287
288 _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
289
290 aPtr += 16;
291 bPtr += 16;
292 cPtr += 16;
293 }
294
/* Scalar tail: restart at the first element the vector loop did not cover and
 * index the original (un-advanced) vectors directly. */
295 number = sixteenthPoints * 16;
296 for (; number < num_points; number++) {
297 cVector[number] = aVector[number] & bVector[number];
298 }
299 }
300 #endif /* LV_HAVE_AVX512F */
301
302 #ifdef LV_HAVE_AVX2
303 #include <immintrin.h>
304
/*!
 * Unaligned AVX2 kernel: writes cVector[i] = aVector[i] & bVector[i] for
 * every i in [0, num_points).
 *
 * \param cVector    Output buffer, num_points elements.
 * \param aVector    First input buffer, num_points elements.
 * \param bVector    Second input buffer, num_points elements.
 * \param num_points Number of int32_t elements to process.
 *
 * Same structure as the aligned AVX2 kernel, but it uses
 * _mm256_loadu_si256 / _mm256_storeu_si256, the unaligned-access forms, so
 * the buffers need no particular alignment. The main loop handles 8 elements
 * per iteration; the remaining num_points % 8 elements are handled by a
 * scalar loop.
 */
305 2 static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
306 const int32_t* aVector,
307 const int32_t* bVector,
308 unsigned int num_points)
309 {
310 2 unsigned int number = 0;
311 2 const unsigned int oneEightPoints = num_points / 8;
312
313 2 int32_t* cPtr = cVector;
314 2 const int32_t* aPtr = aVector;
315 2 const int32_t* bPtr = bVector;
316
317 __m256i aVal, bVal, cVal;
318
2/2
✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
32768 for (; number < oneEightPoints; number++) {
319
/* The (__m256i*) casts only reinterpret the int32_t pointers for the
 * intrinsic's parameter type; the data itself is not converted. */
320 32766 aVal = _mm256_loadu_si256((__m256i*)aPtr);
321 32766 bVal = _mm256_loadu_si256((__m256i*)bPtr);
322
323 32766 cVal = _mm256_and_si256(aVal, bVal);
324
325 _mm256_storeu_si256((__m256i*)cPtr,
326 cVal); // Store the results back into the C container
327
328 32766 aPtr += 8;
329 32766 bPtr += 8;
330 32766 cPtr += 8;
331 }
332
/* Scalar tail: restart at the first element the vector loop did not cover and
 * index the original (un-advanced) vectors directly. */
333 2 number = oneEightPoints * 8;
334
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
16 for (; number < num_points; number++) {
335 14 cVector[number] = aVector[number] & bVector[number];
336 }
337 2 }
338 #endif /* LV_HAVE_AVX2 */
339
340
341 #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
342