Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16i_32fc_dot_prod_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This block computes the dot product (or inner product) between two | ||
16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
17 | * num_points taps, the result is the sum of products between the two | ||
18 | * vectors. The result is a single value stored in the \p result | ||
19 | * address and will be complex. | ||
20 | * | ||
21 | * <b>Dispatcher Prototype</b> | ||
22 | * \code | ||
23 | * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t | ||
24 | * * taps, unsigned int num_points) \endcode | ||
25 | * | ||
26 | * \b Inputs | ||
27 | * \li input: vector of shorts. | ||
28 | * \li taps: complex taps. | ||
29 | * \li num_points: number of samples in both \p input and \p taps. | ||
30 | * | ||
31 | * \b Outputs | ||
32 | * \li result: pointer to a complex value to hold the dot product result. | ||
33 | * | ||
34 | * \b Example | ||
35 | * \code | ||
36 | * int N = 10000; | ||
37 | * | ||
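38 | * // input must hold N shorts and taps N lv_32fc_t values, both filled by the caller | ||
39 | * lv_32fc_t result; | ||
40 | * volk_16i_32fc_dot_prod_32fc(&result, input, taps, N); | ||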
41 | * | ||
42 | * \endcode | ||
43 | */ | ||
44 | |||
45 | #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H | ||
46 | #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H | ||
47 | |||
48 | #include <stdio.h> | ||
49 | #include <volk/volk_common.h> | ||
50 | |||
51 | |||
52 | #ifdef LV_HAVE_GENERIC | ||
53 | |||
54 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, | |
55 | const short* input, | ||
56 | const lv_32fc_t* taps, | ||
57 | unsigned int num_points) | ||
58 | { | ||
59 | |||
60 | static const int N_UNROLL = 4; | ||
61 | |||
62 | 2 | lv_32fc_t acc0 = 0; | |
63 | 2 | lv_32fc_t acc1 = 0; | |
64 | 2 | lv_32fc_t acc2 = 0; | |
65 | 2 | lv_32fc_t acc3 = 0; | |
66 | |||
67 | 2 | unsigned i = 0; | |
68 | 2 | unsigned n = (num_points / N_UNROLL) * N_UNROLL; | |
69 | |||
70 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (i = 0; i < n; i += N_UNROLL) { |
71 | 65534 | acc0 += taps[i + 0] * (float)input[i + 0]; | |
72 | 65534 | acc1 += taps[i + 1] * (float)input[i + 1]; | |
73 | 65534 | acc2 += taps[i + 2] * (float)input[i + 2]; | |
74 | 65534 | acc3 += taps[i + 3] * (float)input[i + 3]; | |
75 | } | ||
76 | |||
77 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; i < num_points; i++) { |
78 | 6 | acc0 += taps[i] * (float)input[i]; | |
79 | } | ||
80 | |||
81 | 2 | *result = acc0 + acc1 + acc2 + acc3; | |
82 | 2 | } | |
83 | |||
84 | #endif /*LV_HAVE_GENERIC*/ | ||
85 | |||
86 | #ifdef LV_HAVE_NEON | ||
87 | #include <arm_neon.h> | ||
88 | static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, | ||
89 | const short* input, | ||
90 | const lv_32fc_t* taps, | ||
91 | unsigned int num_points) | ||
92 | { | ||
93 | |||
94 | unsigned ii; | ||
95 | unsigned quarter_points = num_points / 4; | ||
96 | lv_32fc_t* tapsPtr = (lv_32fc_t*)taps; | ||
97 | short* inputPtr = (short*)input; | ||
98 | lv_32fc_t accumulator_vec[4]; | ||
99 | |||
100 | float32x4x2_t tapsVal, accumulator_val; | ||
101 | int16x4_t input16; | ||
102 | int32x4_t input32; | ||
103 | float32x4_t input_float, prod_re, prod_im; | ||
104 | |||
105 | accumulator_val.val[0] = vdupq_n_f32(0.0); | ||
106 | accumulator_val.val[1] = vdupq_n_f32(0.0); | ||
107 | |||
108 | for (ii = 0; ii < quarter_points; ++ii) { | ||
109 | tapsVal = vld2q_f32((float*)tapsPtr); | ||
110 | input16 = vld1_s16(inputPtr); | ||
111 | // widen 16-bit int to 32-bit int | ||
112 | input32 = vmovl_s16(input16); | ||
113 | // convert 32-bit int to float with scale | ||
114 | input_float = vcvtq_f32_s32(input32); | ||
115 | |||
116 | prod_re = vmulq_f32(input_float, tapsVal.val[0]); | ||
117 | prod_im = vmulq_f32(input_float, tapsVal.val[1]); | ||
118 | |||
119 | accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); | ||
120 | accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); | ||
121 | |||
122 | tapsPtr += 4; | ||
123 | inputPtr += 4; | ||
124 | } | ||
125 | vst2q_f32((float*)accumulator_vec, accumulator_val); | ||
126 | accumulator_vec[0] += accumulator_vec[1]; | ||
127 | accumulator_vec[2] += accumulator_vec[3]; | ||
128 | accumulator_vec[0] += accumulator_vec[2]; | ||
129 | |||
130 | for (ii = quarter_points * 4; ii < num_points; ++ii) { | ||
131 | accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); | ||
132 | } | ||
133 | |||
134 | *result = accumulator_vec[0]; | ||
135 | } | ||
136 | |||
137 | #endif /*LV_HAVE_NEON*/ | ||
138 | |||
139 | #if LV_HAVE_SSE && LV_HAVE_MMX | ||
140 | |||
141 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, | |
142 | const short* input, | ||
143 | const lv_32fc_t* taps, | ||
144 | unsigned int num_points) | ||
145 | { | ||
146 | |||
147 | 2 | unsigned int number = 0; | |
148 | 2 | const unsigned int eighthPoints = num_points / 8; | |
149 | |||
150 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
151 | 2 | const short* aPtr = input; | |
152 | 2 | const float* bPtr = (float*)taps; | |
153 | |||
154 | __m64 m0, m1; | ||
155 | __m128 f0, f1, f2, f3; | ||
156 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
157 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
158 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
159 | |||
160 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
161 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
162 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
163 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
164 | |||
165 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
166 | |||
167 | 32766 | m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); | |
168 | 65532 | m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); | |
169 | 32766 | f0 = _mm_cvtpi16_ps(m0); | |
170 | 32766 | f1 = _mm_cvtpi16_ps(m0); | |
171 | 32766 | f2 = _mm_cvtpi16_ps(m1); | |
172 | 32766 | f3 = _mm_cvtpi16_ps(m1); | |
173 | |||
174 | 32766 | a0Val = _mm_unpacklo_ps(f0, f1); | |
175 | 32766 | a1Val = _mm_unpackhi_ps(f0, f1); | |
176 | 32766 | a2Val = _mm_unpacklo_ps(f2, f3); | |
177 | 32766 | a3Val = _mm_unpackhi_ps(f2, f3); | |
178 | |||
179 | 32766 | b0Val = _mm_loadu_ps(bPtr); | |
180 | 32766 | b1Val = _mm_loadu_ps(bPtr + 4); | |
181 | 32766 | b2Val = _mm_loadu_ps(bPtr + 8); | |
182 | 65532 | b3Val = _mm_loadu_ps(bPtr + 12); | |
183 | |||
184 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
185 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
186 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
187 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
188 | |||
189 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
190 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
191 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
192 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
193 | |||
194 | 32766 | aPtr += 8; | |
195 | 32766 | bPtr += 16; | |
196 | } | ||
197 | |||
198 | _mm_empty(); // clear the mmx technology state | ||
199 | |||
200 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
201 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
202 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
203 | |||
204 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
205 | |||
206 | _mm_store_ps(dotProductVector, | ||
207 | dotProdVal0); // Store the results back into the dot product vector | ||
208 | |||
209 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
210 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
211 | |||
212 | 2 | number = eighthPoints * 8; | |
213 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
214 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
215 | 14 | aPtr += 1; | |
216 | 14 | bPtr += 2; | |
217 | } | ||
218 | |||
219 | 2 | *result = returnValue; | |
220 | 2 | } | |
221 | |||
222 | #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ | ||
223 | |||
224 | |||
225 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
226 | |||
227 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, | |
228 | const short* input, | ||
229 | const lv_32fc_t* taps, | ||
230 | unsigned int num_points) | ||
231 | { | ||
232 | |||
233 | 2 | unsigned int number = 0; | |
234 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
235 | |||
236 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
237 | 2 | const short* aPtr = input; | |
238 | 2 | const float* bPtr = (float*)taps; | |
239 | |||
240 | __m128i m0, m1; | ||
241 | __m256i f0, f1; | ||
242 | __m256 g0, g1, h0, h1, h2, h3; | ||
243 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
244 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
245 | |||
246 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
247 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
248 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
249 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
250 | |||
251 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
252 | |||
253 | 16382 | m0 = _mm_loadu_si128((__m128i const*)aPtr); | |
254 | 32764 | m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); | |
255 | |||
256 | 16382 | f0 = _mm256_cvtepi16_epi32(m0); | |
257 | 16382 | g0 = _mm256_cvtepi32_ps(f0); | |
258 | 16382 | f1 = _mm256_cvtepi16_epi32(m1); | |
259 | 16382 | g1 = _mm256_cvtepi32_ps(f1); | |
260 | |||
261 | 16382 | h0 = _mm256_unpacklo_ps(g0, g0); | |
262 | 16382 | h1 = _mm256_unpackhi_ps(g0, g0); | |
263 | 16382 | h2 = _mm256_unpacklo_ps(g1, g1); | |
264 | 16382 | h3 = _mm256_unpackhi_ps(g1, g1); | |
265 | |||
266 | 16382 | a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); | |
267 | 16382 | a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); | |
268 | 16382 | a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); | |
269 | 16382 | a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); | |
270 | |||
271 | 16382 | b0Val = _mm256_loadu_ps(bPtr); | |
272 | 16382 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
273 | 16382 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
274 | 32764 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
275 | |||
276 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
277 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
278 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
279 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
280 | |||
281 | 16382 | aPtr += 16; | |
282 | 16382 | bPtr += 32; | |
283 | } | ||
284 | |||
285 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
286 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
287 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
288 | |||
289 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
290 | |||
291 | _mm256_store_ps(dotProductVector, | ||
292 | dotProdVal0); // Store the results back into the dot product vector | ||
293 | |||
294 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
295 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
296 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
297 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
298 | |||
299 | 2 | number = sixteenthPoints * 16; | |
300 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
301 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
302 | 30 | aPtr += 1; | |
303 | 30 | bPtr += 2; | |
304 | } | ||
305 | |||
306 | 2 | *result = returnValue; | |
307 | 2 | } | |
308 | |||
309 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
310 | |||
311 | |||
312 | #ifdef LV_HAVE_AVX2 | ||
313 | |||
314 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result, | |
315 | const short* input, | ||
316 | const lv_32fc_t* taps, | ||
317 | unsigned int num_points) | ||
318 | { | ||
319 | |||
320 | 2 | unsigned int number = 0; | |
321 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
322 | |||
323 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
324 | 2 | const short* aPtr = input; | |
325 | 2 | const float* bPtr = (float*)taps; | |
326 | |||
327 | __m128i m0, m1; | ||
328 | __m256i f0, f1; | ||
329 | __m256 g0, g1, h0, h1, h2, h3; | ||
330 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
331 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
332 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
333 | |||
334 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
335 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
336 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
337 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
338 | |||
339 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
340 | |||
341 | 16382 | m0 = _mm_loadu_si128((__m128i const*)aPtr); | |
342 | 32764 | m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); | |
343 | |||
344 | 16382 | f0 = _mm256_cvtepi16_epi32(m0); | |
345 | 16382 | g0 = _mm256_cvtepi32_ps(f0); | |
346 | 16382 | f1 = _mm256_cvtepi16_epi32(m1); | |
347 | 16382 | g1 = _mm256_cvtepi32_ps(f1); | |
348 | |||
349 | 16382 | h0 = _mm256_unpacklo_ps(g0, g0); | |
350 | 16382 | h1 = _mm256_unpackhi_ps(g0, g0); | |
351 | 16382 | h2 = _mm256_unpacklo_ps(g1, g1); | |
352 | 16382 | h3 = _mm256_unpackhi_ps(g1, g1); | |
353 | |||
354 | 16382 | a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); | |
355 | 16382 | a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); | |
356 | 16382 | a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); | |
357 | 16382 | a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); | |
358 | |||
359 | 16382 | b0Val = _mm256_loadu_ps(bPtr); | |
360 | 16382 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
361 | 16382 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
362 | 32764 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
363 | |||
364 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
365 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
366 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
367 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
368 | |||
369 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
370 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
371 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
372 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
373 | |||
374 | 16382 | aPtr += 16; | |
375 | 16382 | bPtr += 32; | |
376 | } | ||
377 | |||
378 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
379 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
380 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
381 | |||
382 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
383 | |||
384 | _mm256_store_ps(dotProductVector, | ||
385 | dotProdVal0); // Store the results back into the dot product vector | ||
386 | |||
387 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
388 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
389 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
390 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
391 | |||
392 | 2 | number = sixteenthPoints * 16; | |
393 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
394 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
395 | 30 | aPtr += 1; | |
396 | 30 | bPtr += 2; | |
397 | } | ||
398 | |||
399 | 2 | *result = returnValue; | |
400 | 2 | } | |
401 | |||
402 | #endif /*LV_HAVE_AVX2*/ | ||
403 | |||
404 | |||
405 | #if LV_HAVE_SSE && LV_HAVE_MMX | ||
406 | |||
407 | |||
408 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result, | |
409 | const short* input, | ||
410 | const lv_32fc_t* taps, | ||
411 | unsigned int num_points) | ||
412 | { | ||
413 | |||
414 | 2 | unsigned int number = 0; | |
415 | 2 | const unsigned int eighthPoints = num_points / 8; | |
416 | |||
417 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
418 | 2 | const short* aPtr = input; | |
419 | 2 | const float* bPtr = (float*)taps; | |
420 | |||
421 | __m64 m0, m1; | ||
422 | __m128 f0, f1, f2, f3; | ||
423 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
424 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
425 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
426 | |||
427 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
428 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
429 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
430 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
431 | |||
432 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
433 | |||
434 | 32766 | m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); | |
435 | 65532 | m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); | |
436 | 32766 | f0 = _mm_cvtpi16_ps(m0); | |
437 | 32766 | f1 = _mm_cvtpi16_ps(m0); | |
438 | 32766 | f2 = _mm_cvtpi16_ps(m1); | |
439 | 32766 | f3 = _mm_cvtpi16_ps(m1); | |
440 | |||
441 | 32766 | a0Val = _mm_unpacklo_ps(f0, f1); | |
442 | 32766 | a1Val = _mm_unpackhi_ps(f0, f1); | |
443 | 32766 | a2Val = _mm_unpacklo_ps(f2, f3); | |
444 | 32766 | a3Val = _mm_unpackhi_ps(f2, f3); | |
445 | |||
446 | 32766 | b0Val = _mm_load_ps(bPtr); | |
447 | 32766 | b1Val = _mm_load_ps(bPtr + 4); | |
448 | 32766 | b2Val = _mm_load_ps(bPtr + 8); | |
449 | 65532 | b3Val = _mm_load_ps(bPtr + 12); | |
450 | |||
451 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
452 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
453 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
454 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
455 | |||
456 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
457 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
458 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
459 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
460 | |||
461 | 32766 | aPtr += 8; | |
462 | 32766 | bPtr += 16; | |
463 | } | ||
464 | |||
465 | _mm_empty(); // clear the mmx technology state | ||
466 | |||
467 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
468 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
469 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
470 | |||
471 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
472 | |||
473 | _mm_store_ps(dotProductVector, | ||
474 | dotProdVal0); // Store the results back into the dot product vector | ||
475 | |||
476 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
477 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
478 | |||
479 | 2 | number = eighthPoints * 8; | |
480 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
481 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
482 | 14 | aPtr += 1; | |
483 | 14 | bPtr += 2; | |
484 | } | ||
485 | |||
486 | 2 | *result = returnValue; | |
487 | 2 | } | |
488 | |||
489 | #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ | ||
490 | |||
491 | #ifdef LV_HAVE_AVX2 | ||
492 | |||
493 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result, | |
494 | const short* input, | ||
495 | const lv_32fc_t* taps, | ||
496 | unsigned int num_points) | ||
497 | { | ||
498 | |||
499 | 2 | unsigned int number = 0; | |
500 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
501 | |||
502 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
503 | 2 | const short* aPtr = input; | |
504 | 2 | const float* bPtr = (float*)taps; | |
505 | |||
506 | __m128i m0, m1; | ||
507 | __m256i f0, f1; | ||
508 | __m256 g0, g1, h0, h1, h2, h3; | ||
509 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
510 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
511 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
512 | |||
513 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
514 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
515 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
516 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
517 | |||
518 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
519 | |||
520 | 16382 | m0 = _mm_load_si128((__m128i const*)aPtr); | |
521 | 32764 | m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); | |
522 | |||
523 | 16382 | f0 = _mm256_cvtepi16_epi32(m0); | |
524 | 16382 | g0 = _mm256_cvtepi32_ps(f0); | |
525 | 16382 | f1 = _mm256_cvtepi16_epi32(m1); | |
526 | 16382 | g1 = _mm256_cvtepi32_ps(f1); | |
527 | |||
528 | 16382 | h0 = _mm256_unpacklo_ps(g0, g0); | |
529 | 16382 | h1 = _mm256_unpackhi_ps(g0, g0); | |
530 | 16382 | h2 = _mm256_unpacklo_ps(g1, g1); | |
531 | 16382 | h3 = _mm256_unpackhi_ps(g1, g1); | |
532 | |||
533 | 16382 | a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); | |
534 | 16382 | a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); | |
535 | 16382 | a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); | |
536 | 16382 | a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); | |
537 | |||
538 | 16382 | b0Val = _mm256_load_ps(bPtr); | |
539 | 16382 | b1Val = _mm256_load_ps(bPtr + 8); | |
540 | 16382 | b2Val = _mm256_load_ps(bPtr + 16); | |
541 | 32764 | b3Val = _mm256_load_ps(bPtr + 24); | |
542 | |||
543 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
544 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
545 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
546 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
547 | |||
548 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
549 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
550 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
551 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
552 | |||
553 | 16382 | aPtr += 16; | |
554 | 16382 | bPtr += 32; | |
555 | } | ||
556 | |||
557 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
558 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
559 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
560 | |||
561 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
562 | |||
563 | _mm256_store_ps(dotProductVector, | ||
564 | dotProdVal0); // Store the results back into the dot product vector | ||
565 | |||
566 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
567 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
568 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
569 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
570 | |||
571 | 2 | number = sixteenthPoints * 16; | |
572 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
573 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
574 | 30 | aPtr += 1; | |
575 | 30 | bPtr += 2; | |
576 | } | ||
577 | |||
578 | 2 | *result = returnValue; | |
579 | 2 | } | |
580 | |||
581 | |||
582 | #endif /*LV_HAVE_AVX2*/ | ||
583 | |||
584 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
585 | |||
586 | 2 | static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, | |
587 | const short* input, | ||
588 | const lv_32fc_t* taps, | ||
589 | unsigned int num_points) | ||
590 | { | ||
591 | |||
592 | 2 | unsigned int number = 0; | |
593 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
594 | |||
595 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
596 | 2 | const short* aPtr = input; | |
597 | 2 | const float* bPtr = (float*)taps; | |
598 | |||
599 | __m128i m0, m1; | ||
600 | __m256i f0, f1; | ||
601 | __m256 g0, g1, h0, h1, h2, h3; | ||
602 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
603 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
604 | |||
605 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
606 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
607 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
608 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
609 | |||
610 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
611 | |||
612 | 16382 | m0 = _mm_load_si128((__m128i const*)aPtr); | |
613 | 32764 | m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); | |
614 | |||
615 | 16382 | f0 = _mm256_cvtepi16_epi32(m0); | |
616 | 16382 | g0 = _mm256_cvtepi32_ps(f0); | |
617 | 16382 | f1 = _mm256_cvtepi16_epi32(m1); | |
618 | 16382 | g1 = _mm256_cvtepi32_ps(f1); | |
619 | |||
620 | 16382 | h0 = _mm256_unpacklo_ps(g0, g0); | |
621 | 16382 | h1 = _mm256_unpackhi_ps(g0, g0); | |
622 | 16382 | h2 = _mm256_unpacklo_ps(g1, g1); | |
623 | 16382 | h3 = _mm256_unpackhi_ps(g1, g1); | |
624 | |||
625 | 16382 | a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); | |
626 | 16382 | a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); | |
627 | 16382 | a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); | |
628 | 16382 | a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); | |
629 | |||
630 | 16382 | b0Val = _mm256_load_ps(bPtr); | |
631 | 16382 | b1Val = _mm256_load_ps(bPtr + 8); | |
632 | 16382 | b2Val = _mm256_load_ps(bPtr + 16); | |
633 | 32764 | b3Val = _mm256_load_ps(bPtr + 24); | |
634 | |||
635 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
636 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
637 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
638 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
639 | |||
640 | 16382 | aPtr += 16; | |
641 | 16382 | bPtr += 32; | |
642 | } | ||
643 | |||
644 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
645 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
646 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
647 | |||
648 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
649 | |||
650 | _mm256_store_ps(dotProductVector, | ||
651 | dotProdVal0); // Store the results back into the dot product vector | ||
652 | |||
653 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
654 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
655 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
656 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
657 | |||
658 | 2 | number = sixteenthPoints * 16; | |
659 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
660 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]); | |
661 | 30 | aPtr += 1; | |
662 | 30 | bPtr += 2; | |
663 | } | ||
664 | |||
665 | 2 | *result = returnValue; | |
666 | 2 | } | |
667 | |||
668 | |||
669 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
670 | |||
671 | |||
672 | #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/ | ||
673 |