Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32f_x2_dot_prod_16i | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This block computes the dot product (or inner product) between two | ||
16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
17 | * num_points taps, the result is the sum of products between the two | ||
18 | * vectors. The result is a single value stored in the \p result | ||
19 | * address and is converted to a fixed-point short. | ||
20 | * | ||
21 | * <b>Dispatcher Prototype</b> | ||
22 | * \code | ||
23 | * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, | ||
24 | * unsigned int num_points) \endcode | ||
25 | * | ||
26 | * \b Inputs | ||
27 | * \li input: vector of floats. | ||
28 | * \li taps: float taps. | ||
29 | * \li num_points: number of samples in both \p input and \p taps. | ||
30 | * | ||
31 | * \b Outputs | ||
32 | * \li result: pointer to a short value to hold the dot product result. | ||
33 | * | ||
34 | * \b Example | ||
35 | * \code | ||
36 | * int N = 10000; | ||
37 | * | ||
38 | * <FIXME> | ||
39 | * | ||
40 | * volk_32f_x2_dot_prod_16i(); | ||
41 | * | ||
42 | * \endcode | ||
43 | */ | ||
44 | |||
45 | #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H | ||
46 | #define INCLUDED_volk_32f_x2_dot_prod_16i_H | ||
47 | |||
48 | #include <stdio.h> | ||
49 | #include <volk/volk_common.h> | ||
50 | |||
51 | |||
52 | #ifdef LV_HAVE_GENERIC | ||
53 | |||
54 | |||
55 | 2 | static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, | |
56 | const float* input, | ||
57 | const float* taps, | ||
58 | unsigned int num_points) | ||
59 | { | ||
60 | |||
61 | 2 | float dotProduct = 0; | |
62 | 2 | const float* aPtr = input; | |
63 | 2 | const float* bPtr = taps; | |
64 | 2 | unsigned int number = 0; | |
65 | |||
66 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
67 | 262142 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
68 | } | ||
69 | |||
70 | 2 | *result = (int16_t)dotProduct; | |
71 | 2 | } | |
72 | |||
73 | #endif /*LV_HAVE_GENERIC*/ | ||
74 | |||
75 | |||
76 | #ifdef LV_HAVE_SSE | ||
77 | |||
78 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, | |
79 | const float* input, | ||
80 | const float* taps, | ||
81 | unsigned int num_points) | ||
82 | { | ||
83 | |||
84 | 2 | unsigned int number = 0; | |
85 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
86 | |||
87 | 2 | float dotProduct = 0; | |
88 | 2 | const float* aPtr = input; | |
89 | 2 | const float* bPtr = taps; | |
90 | |||
91 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
92 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
93 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
94 | |||
95 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
96 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
97 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
98 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
99 | |||
100 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
101 | |||
102 | 16382 | a0Val = _mm_load_ps(aPtr); | |
103 | 16382 | a1Val = _mm_load_ps(aPtr + 4); | |
104 | 16382 | a2Val = _mm_load_ps(aPtr + 8); | |
105 | 32764 | a3Val = _mm_load_ps(aPtr + 12); | |
106 | 16382 | b0Val = _mm_load_ps(bPtr); | |
107 | 16382 | b1Val = _mm_load_ps(bPtr + 4); | |
108 | 16382 | b2Val = _mm_load_ps(bPtr + 8); | |
109 | 32764 | b3Val = _mm_load_ps(bPtr + 12); | |
110 | |||
111 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
112 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
113 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
114 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
115 | |||
116 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
117 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
118 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
119 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
120 | |||
121 | 16382 | aPtr += 16; | |
122 | 16382 | bPtr += 16; | |
123 | } | ||
124 | |||
125 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
126 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
127 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
128 | |||
129 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
130 | |||
131 | _mm_store_ps(dotProductVector, | ||
132 | dotProdVal0); // Store the results back into the dot product vector | ||
133 | |||
134 | 2 | dotProduct = dotProductVector[0]; | |
135 | 2 | dotProduct += dotProductVector[1]; | |
136 | 2 | dotProduct += dotProductVector[2]; | |
137 | 2 | dotProduct += dotProductVector[3]; | |
138 | |||
139 | 2 | number = sixteenthPoints * 16; | |
140 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
141 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
142 | } | ||
143 | |||
144 | 2 | *result = (short)dotProduct; | |
145 | 2 | } | |
146 | |||
147 | #endif /*LV_HAVE_SSE*/ | ||
148 | |||
149 | |||
150 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
151 | |||
152 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, | |
153 | const float* input, | ||
154 | const float* taps, | ||
155 | unsigned int num_points) | ||
156 | { | ||
157 | |||
158 | 2 | unsigned int number = 0; | |
159 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
160 | |||
161 | 2 | float dotProduct = 0; | |
162 | 2 | const float* aPtr = input; | |
163 | 2 | const float* bPtr = taps; | |
164 | |||
165 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
166 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
167 | |||
168 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
169 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
170 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
171 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
172 | |||
173 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
174 | |||
175 | 8190 | a0Val = _mm256_load_ps(aPtr); | |
176 | 8190 | a1Val = _mm256_load_ps(aPtr + 8); | |
177 | 8190 | a2Val = _mm256_load_ps(aPtr + 16); | |
178 | 16380 | a3Val = _mm256_load_ps(aPtr + 24); | |
179 | 8190 | b0Val = _mm256_load_ps(bPtr); | |
180 | 8190 | b1Val = _mm256_load_ps(bPtr + 8); | |
181 | 8190 | b2Val = _mm256_load_ps(bPtr + 16); | |
182 | 16380 | b3Val = _mm256_load_ps(bPtr + 24); | |
183 | |||
184 | 8190 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
185 | 8190 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
186 | 8190 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
187 | 8190 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
188 | |||
189 | 8190 | aPtr += 32; | |
190 | 8190 | bPtr += 32; | |
191 | } | ||
192 | |||
193 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
194 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
195 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
196 | |||
197 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
198 | |||
199 | _mm256_store_ps(dotProductVector, | ||
200 | dotProdVal0); // Store the results back into the dot product vector | ||
201 | |||
202 | 2 | dotProduct = dotProductVector[0]; | |
203 | 2 | dotProduct += dotProductVector[1]; | |
204 | 2 | dotProduct += dotProductVector[2]; | |
205 | 2 | dotProduct += dotProductVector[3]; | |
206 | 2 | dotProduct += dotProductVector[4]; | |
207 | 2 | dotProduct += dotProductVector[5]; | |
208 | 2 | dotProduct += dotProductVector[6]; | |
209 | 2 | dotProduct += dotProductVector[7]; | |
210 | |||
211 | 2 | number = thirtysecondPoints * 32; | |
212 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
213 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
214 | } | ||
215 | |||
216 | 2 | *result = (short)dotProduct; | |
217 | 2 | } | |
218 | |||
219 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
220 | |||
221 | |||
222 | #ifdef LV_HAVE_AVX | ||
223 | |||
224 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, | |
225 | const float* input, | ||
226 | const float* taps, | ||
227 | unsigned int num_points) | ||
228 | { | ||
229 | |||
230 | 2 | unsigned int number = 0; | |
231 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
232 | |||
233 | 2 | float dotProduct = 0; | |
234 | 2 | const float* aPtr = input; | |
235 | 2 | const float* bPtr = taps; | |
236 | |||
237 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
238 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
239 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
240 | |||
241 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
242 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
243 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
244 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
245 | |||
246 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
247 | |||
248 | 8190 | a0Val = _mm256_load_ps(aPtr); | |
249 | 8190 | a1Val = _mm256_load_ps(aPtr + 8); | |
250 | 8190 | a2Val = _mm256_load_ps(aPtr + 16); | |
251 | 16380 | a3Val = _mm256_load_ps(aPtr + 24); | |
252 | 8190 | b0Val = _mm256_load_ps(bPtr); | |
253 | 8190 | b1Val = _mm256_load_ps(bPtr + 8); | |
254 | 8190 | b2Val = _mm256_load_ps(bPtr + 16); | |
255 | 16380 | b3Val = _mm256_load_ps(bPtr + 24); | |
256 | |||
257 | 8190 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
258 | 8190 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
259 | 8190 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
260 | 8190 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
261 | |||
262 | 8190 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
263 | 8190 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
264 | 8190 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
265 | 8190 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
266 | |||
267 | 8190 | aPtr += 32; | |
268 | 8190 | bPtr += 32; | |
269 | } | ||
270 | |||
271 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
272 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
273 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
274 | |||
275 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
276 | |||
277 | _mm256_store_ps(dotProductVector, | ||
278 | dotProdVal0); // Store the results back into the dot product vector | ||
279 | |||
280 | 2 | dotProduct = dotProductVector[0]; | |
281 | 2 | dotProduct += dotProductVector[1]; | |
282 | 2 | dotProduct += dotProductVector[2]; | |
283 | 2 | dotProduct += dotProductVector[3]; | |
284 | 2 | dotProduct += dotProductVector[4]; | |
285 | 2 | dotProduct += dotProductVector[5]; | |
286 | 2 | dotProduct += dotProductVector[6]; | |
287 | 2 | dotProduct += dotProductVector[7]; | |
288 | |||
289 | 2 | number = thirtysecondPoints * 32; | |
290 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
291 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
292 | } | ||
293 | |||
294 | 2 | *result = (short)dotProduct; | |
295 | 2 | } | |
296 | |||
297 | #endif /*LV_HAVE_AVX*/ | ||
298 | |||
299 | #ifdef LV_HAVE_AVX512F | ||
300 | |||
301 | ✗ | static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, | |
302 | const float* input, | ||
303 | const float* taps, | ||
304 | unsigned int num_points) | ||
305 | { | ||
306 | |||
307 | ✗ | unsigned int number = 0; | |
308 | ✗ | const unsigned int sixtyfourthPoints = num_points / 64; | |
309 | |||
310 | ✗ | float dotProduct = 0; | |
311 | ✗ | const float* aPtr = input; | |
312 | ✗ | const float* bPtr = taps; | |
313 | |||
314 | __m512 a0Val, a1Val, a2Val, a3Val; | ||
315 | __m512 b0Val, b1Val, b2Val, b3Val; | ||
316 | |||
317 | ✗ | __m512 dotProdVal0 = _mm512_setzero_ps(); | |
318 | ✗ | __m512 dotProdVal1 = _mm512_setzero_ps(); | |
319 | ✗ | __m512 dotProdVal2 = _mm512_setzero_ps(); | |
320 | ✗ | __m512 dotProdVal3 = _mm512_setzero_ps(); | |
321 | |||
322 | ✗ | for (; number < sixtyfourthPoints; number++) { | |
323 | |||
324 | ✗ | a0Val = _mm512_load_ps(aPtr); | |
325 | ✗ | a1Val = _mm512_load_ps(aPtr + 16); | |
326 | ✗ | a2Val = _mm512_load_ps(aPtr + 32); | |
327 | ✗ | a3Val = _mm512_load_ps(aPtr + 48); | |
328 | ✗ | b0Val = _mm512_load_ps(bPtr); | |
329 | ✗ | b1Val = _mm512_load_ps(bPtr + 16); | |
330 | ✗ | b2Val = _mm512_load_ps(bPtr + 32); | |
331 | ✗ | b3Val = _mm512_load_ps(bPtr + 48); | |
332 | |||
333 | ✗ | dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
334 | ✗ | dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
335 | ✗ | dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
336 | ✗ | dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
337 | |||
338 | ✗ | aPtr += 64; | |
339 | ✗ | bPtr += 64; | |
340 | } | ||
341 | |||
342 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); | |
343 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); | |
344 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); | |
345 | |||
346 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
347 | |||
348 | _mm512_store_ps(dotProductVector, | ||
349 | dotProdVal0); // Store the results back into the dot product vector | ||
350 | |||
351 | ✗ | dotProduct = dotProductVector[0]; | |
352 | ✗ | dotProduct += dotProductVector[1]; | |
353 | ✗ | dotProduct += dotProductVector[2]; | |
354 | ✗ | dotProduct += dotProductVector[3]; | |
355 | ✗ | dotProduct += dotProductVector[4]; | |
356 | ✗ | dotProduct += dotProductVector[5]; | |
357 | ✗ | dotProduct += dotProductVector[6]; | |
358 | ✗ | dotProduct += dotProductVector[7]; | |
359 | ✗ | dotProduct += dotProductVector[8]; | |
360 | ✗ | dotProduct += dotProductVector[9]; | |
361 | ✗ | dotProduct += dotProductVector[10]; | |
362 | ✗ | dotProduct += dotProductVector[11]; | |
363 | ✗ | dotProduct += dotProductVector[12]; | |
364 | ✗ | dotProduct += dotProductVector[13]; | |
365 | ✗ | dotProduct += dotProductVector[14]; | |
366 | ✗ | dotProduct += dotProductVector[15]; | |
367 | |||
368 | ✗ | number = sixtyfourthPoints * 64; | |
369 | ✗ | for (; number < num_points; number++) { | |
370 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
371 | } | ||
372 | |||
373 | ✗ | *result = (short)dotProduct; | |
374 | ✗ | } | |
375 | |||
376 | #endif /*LV_HAVE_AVX512F*/ | ||
377 | |||
378 | |||
379 | #ifdef LV_HAVE_SSE | ||
380 | |||
381 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, | |
382 | const float* input, | ||
383 | const float* taps, | ||
384 | unsigned int num_points) | ||
385 | { | ||
386 | |||
387 | 2 | unsigned int number = 0; | |
388 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
389 | |||
390 | 2 | float dotProduct = 0; | |
391 | 2 | const float* aPtr = input; | |
392 | 2 | const float* bPtr = taps; | |
393 | |||
394 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
395 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
396 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
397 | |||
398 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
399 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
400 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
401 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
402 | |||
403 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
404 | |||
405 | 16382 | a0Val = _mm_loadu_ps(aPtr); | |
406 | 16382 | a1Val = _mm_loadu_ps(aPtr + 4); | |
407 | 16382 | a2Val = _mm_loadu_ps(aPtr + 8); | |
408 | 32764 | a3Val = _mm_loadu_ps(aPtr + 12); | |
409 | 16382 | b0Val = _mm_loadu_ps(bPtr); | |
410 | 16382 | b1Val = _mm_loadu_ps(bPtr + 4); | |
411 | 16382 | b2Val = _mm_loadu_ps(bPtr + 8); | |
412 | 32764 | b3Val = _mm_loadu_ps(bPtr + 12); | |
413 | |||
414 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
415 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
416 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
417 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
418 | |||
419 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
420 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
421 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
422 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
423 | |||
424 | 16382 | aPtr += 16; | |
425 | 16382 | bPtr += 16; | |
426 | } | ||
427 | |||
428 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
429 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
430 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
431 | |||
432 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
433 | |||
434 | _mm_store_ps(dotProductVector, | ||
435 | dotProdVal0); // Store the results back into the dot product vector | ||
436 | |||
437 | 2 | dotProduct = dotProductVector[0]; | |
438 | 2 | dotProduct += dotProductVector[1]; | |
439 | 2 | dotProduct += dotProductVector[2]; | |
440 | 2 | dotProduct += dotProductVector[3]; | |
441 | |||
442 | 2 | number = sixteenthPoints * 16; | |
443 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
444 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
445 | } | ||
446 | |||
447 | 2 | *result = (short)dotProduct; | |
448 | 2 | } | |
449 | |||
450 | #endif /*LV_HAVE_SSE*/ | ||
451 | |||
452 | |||
453 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
454 | |||
455 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, | |
456 | const float* input, | ||
457 | const float* taps, | ||
458 | unsigned int num_points) | ||
459 | { | ||
460 | |||
461 | 2 | unsigned int number = 0; | |
462 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
463 | |||
464 | 2 | float dotProduct = 0; | |
465 | 2 | const float* aPtr = input; | |
466 | 2 | const float* bPtr = taps; | |
467 | |||
468 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
469 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
470 | |||
471 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
472 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
473 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
474 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
475 | |||
476 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
477 | |||
478 | 8190 | a0Val = _mm256_loadu_ps(aPtr); | |
479 | 8190 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
480 | 8190 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
481 | 16380 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
482 | 8190 | b0Val = _mm256_loadu_ps(bPtr); | |
483 | 8190 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
484 | 8190 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
485 | 16380 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
486 | |||
487 | 8190 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
488 | 8190 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
489 | 8190 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
490 | 8190 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
491 | |||
492 | 8190 | aPtr += 32; | |
493 | 8190 | bPtr += 32; | |
494 | } | ||
495 | |||
496 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
497 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
498 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
499 | |||
500 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
501 | |||
502 | _mm256_store_ps(dotProductVector, | ||
503 | dotProdVal0); // Store the results back into the dot product vector | ||
504 | |||
505 | 2 | dotProduct = dotProductVector[0]; | |
506 | 2 | dotProduct += dotProductVector[1]; | |
507 | 2 | dotProduct += dotProductVector[2]; | |
508 | 2 | dotProduct += dotProductVector[3]; | |
509 | 2 | dotProduct += dotProductVector[4]; | |
510 | 2 | dotProduct += dotProductVector[5]; | |
511 | 2 | dotProduct += dotProductVector[6]; | |
512 | 2 | dotProduct += dotProductVector[7]; | |
513 | |||
514 | 2 | number = thirtysecondPoints * 32; | |
515 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
516 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
517 | } | ||
518 | |||
519 | 2 | *result = (short)dotProduct; | |
520 | 2 | } | |
521 | |||
522 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
523 | |||
524 | |||
525 | #ifdef LV_HAVE_AVX | ||
526 | |||
527 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, | |
528 | const float* input, | ||
529 | const float* taps, | ||
530 | unsigned int num_points) | ||
531 | { | ||
532 | |||
533 | 2 | unsigned int number = 0; | |
534 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
535 | |||
536 | 2 | float dotProduct = 0; | |
537 | 2 | const float* aPtr = input; | |
538 | 2 | const float* bPtr = taps; | |
539 | |||
540 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
541 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
542 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
543 | |||
544 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
545 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
546 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
547 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
548 | |||
549 |
2/2✓ Branch 0 taken 8190 times.
✓ Branch 1 taken 2 times.
|
8192 | for (; number < thirtysecondPoints; number++) { |
550 | |||
551 | 8190 | a0Val = _mm256_loadu_ps(aPtr); | |
552 | 8190 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
553 | 8190 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
554 | 16380 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
555 | 8190 | b0Val = _mm256_loadu_ps(bPtr); | |
556 | 8190 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
557 | 8190 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
558 | 16380 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
559 | |||
560 | 8190 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
561 | 8190 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
562 | 8190 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
563 | 8190 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
564 | |||
565 | 8190 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
566 | 8190 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
567 | 8190 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
568 | 8190 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
569 | |||
570 | 8190 | aPtr += 32; | |
571 | 8190 | bPtr += 32; | |
572 | } | ||
573 | |||
574 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
575 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
576 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
577 | |||
578 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
579 | |||
580 | _mm256_store_ps(dotProductVector, | ||
581 | dotProdVal0); // Store the results back into the dot product vector | ||
582 | |||
583 | 2 | dotProduct = dotProductVector[0]; | |
584 | 2 | dotProduct += dotProductVector[1]; | |
585 | 2 | dotProduct += dotProductVector[2]; | |
586 | 2 | dotProduct += dotProductVector[3]; | |
587 | 2 | dotProduct += dotProductVector[4]; | |
588 | 2 | dotProduct += dotProductVector[5]; | |
589 | 2 | dotProduct += dotProductVector[6]; | |
590 | 2 | dotProduct += dotProductVector[7]; | |
591 | |||
592 | 2 | number = thirtysecondPoints * 32; | |
593 |
2/2✓ Branch 0 taken 62 times.
✓ Branch 1 taken 2 times.
|
64 | for (; number < num_points; number++) { |
594 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
595 | } | ||
596 | |||
597 | 2 | *result = (short)dotProduct; | |
598 | 2 | } | |
599 | |||
600 | #endif /*LV_HAVE_AVX*/ | ||
601 | |||
602 | #ifdef LV_HAVE_AVX512F | ||
603 | |||
604 | ✗ | static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, | |
605 | const float* input, | ||
606 | const float* taps, | ||
607 | unsigned int num_points) | ||
608 | { | ||
609 | |||
610 | ✗ | unsigned int number = 0; | |
611 | ✗ | const unsigned int sixtyfourthPoints = num_points / 64; | |
612 | |||
613 | ✗ | float dotProduct = 0; | |
614 | ✗ | const float* aPtr = input; | |
615 | ✗ | const float* bPtr = taps; | |
616 | |||
617 | __m512 a0Val, a1Val, a2Val, a3Val; | ||
618 | __m512 b0Val, b1Val, b2Val, b3Val; | ||
619 | |||
620 | ✗ | __m512 dotProdVal0 = _mm512_setzero_ps(); | |
621 | ✗ | __m512 dotProdVal1 = _mm512_setzero_ps(); | |
622 | ✗ | __m512 dotProdVal2 = _mm512_setzero_ps(); | |
623 | ✗ | __m512 dotProdVal3 = _mm512_setzero_ps(); | |
624 | |||
625 | ✗ | for (; number < sixtyfourthPoints; number++) { | |
626 | |||
627 | ✗ | a0Val = _mm512_loadu_ps(aPtr); | |
628 | ✗ | a1Val = _mm512_loadu_ps(aPtr + 16); | |
629 | ✗ | a2Val = _mm512_loadu_ps(aPtr + 32); | |
630 | ✗ | a3Val = _mm512_loadu_ps(aPtr + 48); | |
631 | ✗ | b0Val = _mm512_loadu_ps(bPtr); | |
632 | ✗ | b1Val = _mm512_loadu_ps(bPtr + 16); | |
633 | ✗ | b2Val = _mm512_loadu_ps(bPtr + 32); | |
634 | ✗ | b3Val = _mm512_loadu_ps(bPtr + 48); | |
635 | |||
636 | ✗ | dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
637 | ✗ | dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
638 | ✗ | dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
639 | ✗ | dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
640 | |||
641 | ✗ | aPtr += 64; | |
642 | ✗ | bPtr += 64; | |
643 | } | ||
644 | |||
645 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); | |
646 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); | |
647 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); | |
648 | |||
649 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
650 | |||
651 | _mm512_storeu_ps(dotProductVector, | ||
652 | dotProdVal0); // Store the results back into the dot product vector | ||
653 | |||
654 | ✗ | dotProduct = dotProductVector[0]; | |
655 | ✗ | dotProduct += dotProductVector[1]; | |
656 | ✗ | dotProduct += dotProductVector[2]; | |
657 | ✗ | dotProduct += dotProductVector[3]; | |
658 | ✗ | dotProduct += dotProductVector[4]; | |
659 | ✗ | dotProduct += dotProductVector[5]; | |
660 | ✗ | dotProduct += dotProductVector[6]; | |
661 | ✗ | dotProduct += dotProductVector[7]; | |
662 | ✗ | dotProduct += dotProductVector[8]; | |
663 | ✗ | dotProduct += dotProductVector[9]; | |
664 | ✗ | dotProduct += dotProductVector[10]; | |
665 | ✗ | dotProduct += dotProductVector[11]; | |
666 | ✗ | dotProduct += dotProductVector[12]; | |
667 | ✗ | dotProduct += dotProductVector[13]; | |
668 | ✗ | dotProduct += dotProductVector[14]; | |
669 | ✗ | dotProduct += dotProductVector[15]; | |
670 | |||
671 | ✗ | number = sixtyfourthPoints * 64; | |
672 | ✗ | for (; number < num_points; number++) { | |
673 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
674 | } | ||
675 | |||
676 | ✗ | *result = (short)dotProduct; | |
677 | ✗ | } | |
678 | |||
679 | #endif /*LV_HAVE_AVX512F*/ | ||
680 | |||
681 | |||
682 | #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/ | ||
683 |