/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*!
 * \page volk_16ic_x2_dot_prod_16ic
 *
 * \b Overview
 *
 * Multiplies two input complex vectors (16-bit integer components) element by
 * element and accumulates the products into a single complex result. The
 * accumulation uses saturating arithmetic, so the result never wraps beyond
 * the limits of the data type.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result,
 *                                 const lv_16sc_t* in_a,
 *                                 const lv_16sc_t* in_b,
 *                                 unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li in_a: One of the vectors to be multiplied and accumulated.
 * \li in_b: The other vector to be multiplied and accumulated.
 * \li num_points: Number of complex values to be multiplied together,
 * accumulated, and stored into \p result.
 *
 * \b Outputs
 * \li result: Value of the accumulated result.
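 *
 * <b>Example</b>
 *
 * A minimal usage sketch; the buffer length and fill values below are
 * illustrative assumptions, not part of the kernel's API:
 * \code
 * unsigned int num_points = 1024;
 * size_t alignment = volk_get_alignment();
 * lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t result;
 * unsigned int n;
 * for (n = 0; n < num_points; n++) {
 *     in_a[n] = lv_cmake((int16_t)1, (int16_t)0); // 1 + 0i
 *     in_b[n] = lv_cmake((int16_t)0, (int16_t)1); // 0 + 1i
 * }
 * volk_16ic_x2_dot_prod_16ic(&result, in_a, in_b, num_points);
 * // every product is 0 + 1i, so result accumulates to 0 + 1024i
 * volk_free(in_a);
 * volk_free(in_b);
 * \endcode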
 *
 */
34 | |||
35 | #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H | ||
36 | #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H | ||
37 | |||
38 | #include <volk/saturation_arithmetic.h> | ||
39 | #include <volk/volk_common.h> | ||
40 | #include <volk/volk_complex.h> | ||
41 | |||
42 | |||
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // shift c right by one int16 so each
                                         // imag*imag product sits over its real*real product
            real = _mm_subs_epi16(c, c_sr); // a.r*b.r - a.i*b.i in the even lanes

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
145 | |||
146 | |||
147 | #ifdef LV_HAVE_SSE2 | ||
148 | #include <emmintrin.h> | ||
149 | |||
150 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, | |
151 | const lv_16sc_t* in_a, | ||
152 | const lv_16sc_t* in_b, | ||
153 | unsigned int num_points) | ||
154 | { | ||
155 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
156 | |||
157 | 2 | const unsigned int sse_iters = num_points / 4; | |
158 | |||
159 | 2 | const lv_16sc_t* _in_a = in_a; | |
160 | 2 | const lv_16sc_t* _in_b = in_b; | |
161 | 2 | lv_16sc_t* _out = out; | |
162 | unsigned int number; | ||
163 | |||
164 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (sse_iters > 0) { |
165 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
166 | realcacc, imagcacc, result; | ||
167 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||
168 | |||
169 | 2 | realcacc = _mm_setzero_si128(); | |
170 | 2 | imagcacc = _mm_setzero_si128(); | |
171 | |||
172 | 2 | mask_imag = _mm_set_epi8( | |
173 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
174 | 2 | mask_real = _mm_set_epi8( | |
175 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
176 | |||
177 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (number = 0; number < sse_iters; number++) { |
178 | // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||
179 | 65534 | a = _mm_loadu_si128( | |
180 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
181 | 65534 | __VOLK_PREFETCH(_in_a + 8); | |
182 | 65534 | b = _mm_loadu_si128((__m128i*)_in_b); | |
183 | 65534 | __VOLK_PREFETCH(_in_b + 8); | |
184 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
185 | |||
186 | 65534 | c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in | |
187 | // zeros, and store the results in dst. | ||
188 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
189 | |||
190 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
191 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
192 | |||
193 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
194 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
195 | |||
196 | 65534 | imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
197 | |||
198 | 65534 | realcacc = _mm_adds_epi16(realcacc, real); | |
199 | 65534 | imagcacc = _mm_adds_epi16(imagcacc, imag); | |
200 | |||
201 | 65534 | _in_a += 4; | |
202 | 65534 | _in_b += 4; | |
203 | } | ||
204 | |||
205 | 2 | realcacc = _mm_and_si128(realcacc, mask_real); | |
206 | 2 | imagcacc = _mm_and_si128(imagcacc, mask_imag); | |
207 | |||
208 | 2 | result = _mm_or_si128(realcacc, imagcacc); | |
209 | |||
210 | _mm_storeu_si128((__m128i*)dotProductVector, | ||
211 | result); // Store the results back into the dot product vector | ||
212 | |||
213 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; ++number) { |
214 | 8 | dotProduct = lv_cmake( | |
215 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
216 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
217 | } | ||
218 | } | ||
219 | |||
220 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (number = 0; number < (num_points % 4); ++number) { |
221 | 6 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
222 | 6 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
223 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
224 | } | ||
225 | |||
226 | 2 | *_out = dotProduct; | |
227 | 2 | } | |
228 | #endif /* LV_HAVE_SSE2 */ | ||
229 | |||
230 | |||
231 | #ifdef LV_HAVE_AVX2 | ||
232 | #include <immintrin.h> | ||
233 | |||
234 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out, | |
235 | const lv_16sc_t* in_a, | ||
236 | const lv_16sc_t* in_b, | ||
237 | unsigned int num_points) | ||
238 | { | ||
239 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
240 | |||
241 | 2 | const unsigned int avx_iters = num_points / 8; | |
242 | |||
243 | 2 | const lv_16sc_t* _in_a = in_a; | |
244 | 2 | const lv_16sc_t* _in_b = in_b; | |
245 | 2 | lv_16sc_t* _out = out; | |
246 | unsigned int number; | ||
247 | |||
248 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (avx_iters > 0) { |
249 | __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
250 | realcacc, imagcacc, result; | ||
251 | __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; | ||
252 | |||
253 | 2 | realcacc = _mm256_setzero_si256(); | |
254 | 2 | imagcacc = _mm256_setzero_si256(); | |
255 | |||
256 | 2 | mask_imag = _mm256_set_epi8(0xFF, | |
257 | 0xFF, | ||
258 | 0, | ||
259 | 0, | ||
260 | 0xFF, | ||
261 | 0xFF, | ||
262 | 0, | ||
263 | 0, | ||
264 | 0xFF, | ||
265 | 0xFF, | ||
266 | 0, | ||
267 | 0, | ||
268 | 0xFF, | ||
269 | 0xFF, | ||
270 | 0, | ||
271 | 0, | ||
272 | 0xFF, | ||
273 | 0xFF, | ||
274 | 0, | ||
275 | 0, | ||
276 | 0xFF, | ||
277 | 0xFF, | ||
278 | 0, | ||
279 | 0, | ||
280 | 0xFF, | ||
281 | 0xFF, | ||
282 | 0, | ||
283 | 0, | ||
284 | 0xFF, | ||
285 | 0xFF, | ||
286 | 0, | ||
287 | 0); | ||
288 | 2 | mask_real = _mm256_set_epi8(0, | |
289 | 0, | ||
290 | 0xFF, | ||
291 | 0xFF, | ||
292 | 0, | ||
293 | 0, | ||
294 | 0xFF, | ||
295 | 0xFF, | ||
296 | 0, | ||
297 | 0, | ||
298 | 0xFF, | ||
299 | 0xFF, | ||
300 | 0, | ||
301 | 0, | ||
302 | 0xFF, | ||
303 | 0xFF, | ||
304 | 0, | ||
305 | 0, | ||
306 | 0xFF, | ||
307 | 0xFF, | ||
308 | 0, | ||
309 | 0, | ||
310 | 0xFF, | ||
311 | 0xFF, | ||
312 | 0, | ||
313 | 0, | ||
314 | 0xFF, | ||
315 | 0xFF, | ||
316 | 0, | ||
317 | 0, | ||
318 | 0xFF, | ||
319 | 0xFF); | ||
320 | |||
321 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < avx_iters; number++) { |
322 | 32766 | a = _mm256_loadu_si256((__m256i*)_in_a); | |
323 | 32766 | __VOLK_PREFETCH(_in_a + 16); | |
324 | 32766 | b = _mm256_loadu_si256((__m256i*)_in_b); | |
325 | 32766 | __VOLK_PREFETCH(_in_b + 16); | |
326 | 32766 | c = _mm256_mullo_epi16(a, b); | |
327 | |||
328 | 32766 | c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting | |
329 | // in zeros, and store the results in dst. | ||
330 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
331 | |||
332 | 32766 | b_sl = _mm256_slli_si256(b, 2); | |
333 | 32766 | a_sl = _mm256_slli_si256(a, 2); | |
334 | |||
335 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); | |
336 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); | |
337 | |||
338 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
339 | |||
340 | 32766 | realcacc = _mm256_adds_epi16(realcacc, real); | |
341 | 32766 | imagcacc = _mm256_adds_epi16(imagcacc, imag); | |
342 | |||
343 | 32766 | _in_a += 8; | |
344 | 32766 | _in_b += 8; | |
345 | } | ||
346 | |||
347 | 2 | realcacc = _mm256_and_si256(realcacc, mask_real); | |
348 | 2 | imagcacc = _mm256_and_si256(imagcacc, mask_imag); | |
349 | |||
350 | 2 | result = _mm256_or_si256(realcacc, imagcacc); | |
351 | |||
352 | _mm256_storeu_si256((__m256i*)dotProductVector, | ||
353 | result); // Store the results back into the dot product vector | ||
354 | |||
355 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; ++number) { |
356 | 16 | dotProduct = lv_cmake( | |
357 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
358 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
359 | } | ||
360 | } | ||
361 | |||
362 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = 0; number < (num_points % 8); ++number) { |
363 | 14 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
364 | 14 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
365 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
366 | } | ||
367 | |||
368 | 2 | *_out = dotProduct; | |
369 | 2 | } | |
370 | #endif /* LV_HAVE_AVX2 */ | ||
371 | |||
372 | |||
373 | #ifdef LV_HAVE_AVX2 | ||
374 | #include <immintrin.h> | ||
375 | |||
376 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out, | |
377 | const lv_16sc_t* in_a, | ||
378 | const lv_16sc_t* in_b, | ||
379 | unsigned int num_points) | ||
380 | { | ||
381 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
382 | |||
383 | 2 | const unsigned int avx_iters = num_points / 8; | |
384 | |||
385 | 2 | const lv_16sc_t* _in_a = in_a; | |
386 | 2 | const lv_16sc_t* _in_b = in_b; | |
387 | 2 | lv_16sc_t* _out = out; | |
388 | unsigned int number; | ||
389 | |||
390 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (avx_iters > 0) { |
391 | __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
392 | realcacc, imagcacc, result; | ||
393 | __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; | ||
394 | |||
395 | 2 | realcacc = _mm256_setzero_si256(); | |
396 | 2 | imagcacc = _mm256_setzero_si256(); | |
397 | |||
398 | 2 | mask_imag = _mm256_set_epi8(0xFF, | |
399 | 0xFF, | ||
400 | 0, | ||
401 | 0, | ||
402 | 0xFF, | ||
403 | 0xFF, | ||
404 | 0, | ||
405 | 0, | ||
406 | 0xFF, | ||
407 | 0xFF, | ||
408 | 0, | ||
409 | 0, | ||
410 | 0xFF, | ||
411 | 0xFF, | ||
412 | 0, | ||
413 | 0, | ||
414 | 0xFF, | ||
415 | 0xFF, | ||
416 | 0, | ||
417 | 0, | ||
418 | 0xFF, | ||
419 | 0xFF, | ||
420 | 0, | ||
421 | 0, | ||
422 | 0xFF, | ||
423 | 0xFF, | ||
424 | 0, | ||
425 | 0, | ||
426 | 0xFF, | ||
427 | 0xFF, | ||
428 | 0, | ||
429 | 0); | ||
430 | 2 | mask_real = _mm256_set_epi8(0, | |
431 | 0, | ||
432 | 0xFF, | ||
433 | 0xFF, | ||
434 | 0, | ||
435 | 0, | ||
436 | 0xFF, | ||
437 | 0xFF, | ||
438 | 0, | ||
439 | 0, | ||
440 | 0xFF, | ||
441 | 0xFF, | ||
442 | 0, | ||
443 | 0, | ||
444 | 0xFF, | ||
445 | 0xFF, | ||
446 | 0, | ||
447 | 0, | ||
448 | 0xFF, | ||
449 | 0xFF, | ||
450 | 0, | ||
451 | 0, | ||
452 | 0xFF, | ||
453 | 0xFF, | ||
454 | 0, | ||
455 | 0, | ||
456 | 0xFF, | ||
457 | 0xFF, | ||
458 | 0, | ||
459 | 0, | ||
460 | 0xFF, | ||
461 | 0xFF); | ||
462 | |||
463 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < avx_iters; number++) { |
464 | 32766 | a = _mm256_load_si256((__m256i*)_in_a); | |
465 | 32766 | __VOLK_PREFETCH(_in_a + 16); | |
466 | 32766 | b = _mm256_load_si256((__m256i*)_in_b); | |
467 | 32766 | __VOLK_PREFETCH(_in_b + 16); | |
468 | 32766 | c = _mm256_mullo_epi16(a, b); | |
469 | |||
470 | 32766 | c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting | |
471 | // in zeros, and store the results in dst. | ||
472 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
473 | |||
474 | 32766 | b_sl = _mm256_slli_si256(b, 2); | |
475 | 32766 | a_sl = _mm256_slli_si256(a, 2); | |
476 | |||
477 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); | |
478 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); | |
479 | |||
480 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
481 | |||
482 | 32766 | realcacc = _mm256_adds_epi16(realcacc, real); | |
483 | 32766 | imagcacc = _mm256_adds_epi16(imagcacc, imag); | |
484 | |||
485 | 32766 | _in_a += 8; | |
486 | 32766 | _in_b += 8; | |
487 | } | ||
488 | |||
489 | 2 | realcacc = _mm256_and_si256(realcacc, mask_real); | |
490 | 2 | imagcacc = _mm256_and_si256(imagcacc, mask_imag); | |
491 | |||
492 | 2 | result = _mm256_or_si256(realcacc, imagcacc); | |
493 | |||
494 | _mm256_store_si256((__m256i*)dotProductVector, | ||
495 | result); // Store the results back into the dot product vector | ||
496 | |||
497 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; ++number) { |
498 | 16 | dotProduct = lv_cmake( | |
499 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
500 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
501 | } | ||
502 | } | ||
503 | |||
504 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = 0; number < (num_points % 8); ++number) { |
505 | 14 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
506 | 14 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
507 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
508 | } | ||
509 | |||
510 | 2 | *_out = dotProduct; | |
511 | 2 | } | |
512 | #endif /* LV_HAVE_AVX2 */ | ||
513 | |||
514 | |||
515 | #ifdef LV_HAVE_NEON | ||
516 | #include <arm_neon.h> | ||
517 | |||
518 | static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, | ||
519 | const lv_16sc_t* in_a, | ||
520 | const lv_16sc_t* in_b, | ||
521 | unsigned int num_points) | ||
522 | { | ||
523 | unsigned int quarter_points = num_points / 4; | ||
524 | unsigned int number; | ||
525 | |||
526 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
527 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
528 | *out = lv_cmake((int16_t)0, (int16_t)0); | ||
529 | |||
530 | if (quarter_points > 0) { | ||
531 | // for 2-lane vectors, 1st lane holds the real part, | ||
532 | // 2nd lane holds the imaginary part | ||
533 | int16x4x2_t a_val, b_val, c_val, accumulator; | ||
534 | int16x4x2_t tmp_real, tmp_imag; | ||
535 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
536 | accumulator.val[0] = vdup_n_s16(0); | ||
537 | accumulator.val[1] = vdup_n_s16(0); | ||
538 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | ||
539 | |||
540 | for (number = 0; number < quarter_points; ++number) { | ||
541 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
542 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
543 | __VOLK_PREFETCH(a_ptr + 8); | ||
544 | __VOLK_PREFETCH(b_ptr + 8); | ||
545 | |||
546 | // multiply the real*real and imag*imag to get real result | ||
547 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
548 | tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
549 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
550 | tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); | ||
551 | |||
552 | // Multiply cross terms to get the imaginary result | ||
553 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
554 | tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); | ||
555 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
556 | tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
557 | |||
558 | c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); | ||
559 | c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); | ||
560 | |||
561 | accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); | ||
562 | accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); | ||
563 | |||
564 | a_ptr += 4; | ||
565 | b_ptr += 4; | ||
566 | } | ||
567 | |||
568 | vst2_s16((int16_t*)accum_result, accumulator); | ||
569 | for (number = 0; number < 4; ++number) { | ||
570 | dotProduct = lv_cmake( | ||
571 | sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), | ||
572 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); | ||
573 | } | ||
574 | |||
575 | *out = dotProduct; | ||
576 | } | ||
577 | |||
578 | // tail case | ||
579 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
580 | *out += (*a_ptr++) * (*b_ptr++); | ||
581 | } | ||
582 | } | ||
583 | |||
584 | #endif /* LV_HAVE_NEON */ | ||
585 | |||
586 | |||
587 | #ifdef LV_HAVE_NEON | ||
588 | #include <arm_neon.h> | ||
589 | |||
590 | static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, | ||
591 | const lv_16sc_t* in_a, | ||
592 | const lv_16sc_t* in_b, | ||
593 | unsigned int num_points) | ||
594 | { | ||
595 | unsigned int quarter_points = num_points / 4; | ||
596 | unsigned int number; | ||
597 | |||
598 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
599 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
600 | // for 2-lane vectors, 1st lane holds the real part, | ||
601 | // 2nd lane holds the imaginary part | ||
602 | int16x4x2_t a_val, b_val, accumulator; | ||
603 | int16x4x2_t tmp; | ||
604 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
605 | accumulator.val[0] = vdup_n_s16(0); | ||
606 | accumulator.val[1] = vdup_n_s16(0); | ||
607 | |||
608 | for (number = 0; number < quarter_points; ++number) { | ||
609 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
610 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
611 | __VOLK_PREFETCH(a_ptr + 8); | ||
612 | __VOLK_PREFETCH(b_ptr + 8); | ||
613 | |||
614 | tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
615 | tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
616 | |||
617 | // use multiply accumulate/subtract to get result | ||
618 | tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); | ||
619 | tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); | ||
620 | |||
621 | accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); | ||
622 | accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); | ||
623 | |||
624 | a_ptr += 4; | ||
625 | b_ptr += 4; | ||
626 | } | ||
627 | |||
628 | vst2_s16((int16_t*)accum_result, accumulator); | ||
629 | *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
630 | |||
631 | // tail case | ||
632 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
633 | *out += (*a_ptr++) * (*b_ptr++); | ||
634 | } | ||
635 | } | ||
636 | |||
637 | #endif /* LV_HAVE_NEON */ | ||
638 | |||
639 | |||
640 | #ifdef LV_HAVE_NEON | ||
641 | #include <arm_neon.h> | ||
642 | |||
643 | static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, | ||
644 | const lv_16sc_t* in_a, | ||
645 | const lv_16sc_t* in_b, | ||
646 | unsigned int num_points) | ||
647 | { | ||
648 | unsigned int quarter_points = num_points / 4; | ||
649 | unsigned int number; | ||
650 | |||
651 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
652 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
653 | // for 2-lane vectors, 1st lane holds the real part, | ||
654 | // 2nd lane holds the imaginary part | ||
655 | int16x4x2_t a_val, b_val, accumulator1, accumulator2; | ||
656 | |||
657 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
658 | accumulator1.val[0] = vdup_n_s16(0); | ||
659 | accumulator1.val[1] = vdup_n_s16(0); | ||
660 | accumulator2.val[0] = vdup_n_s16(0); | ||
661 | accumulator2.val[1] = vdup_n_s16(0); | ||
662 | |||
663 | for (number = 0; number < quarter_points; ++number) { | ||
664 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
665 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
666 | __VOLK_PREFETCH(a_ptr + 8); | ||
667 | __VOLK_PREFETCH(b_ptr + 8); | ||
668 | |||
669 | // use 2 accumulators to remove inter-instruction data dependencies | ||
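        // vmla/vmls wrap on overflow, so unlike the kernels above this variant
        // saturates only when the two partial accumulators are merged after the
        // loop; accumulator2.val[0] gathers -a.i*b.i via vmls, so that merge
        // forms the real part a.r*b.r - a.i*b.i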
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/