Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_8u_x4_conv_k7_r2_8u | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Performs convolutional decoding for a K=7, rate 1/2 convolutional | ||
16 | * code. The polynomials are user defined. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, | ||
21 | * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* | ||
22 | * Branchtab) \endcode | ||
23 | * | ||
24 | * \b Inputs | ||
25 | * \li X: <FIXME> | ||
26 | * \li syms: <FIXME> | ||
27 | * \li dec: <FIXME> | ||
28 | * \li framebits: size of the frame to decode in bits. | ||
29 | * \li excess: <FIXME> | ||
30 | * \li Branchtab: <FIXME> | ||
31 | * | ||
32 | * \b Outputs | ||
33 | * \li Y: The decoded output bits. | ||
34 | * | ||
35 | * \b Example | ||
36 | * \code | ||
37 | * int N = 10000; | ||
38 | * | ||
39 | * volk_8u_x4_conv_k7_r2_8u(); | ||
40 | * | ||
41 | * volk_free(x); | ||
42 | * \endcode | ||
43 | */ | ||
44 | |||
45 | #ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H | ||
46 | #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H | ||
47 | |||
48 | typedef union { | ||
49 | unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/]; | ||
50 | unsigned int w[64 /*NUMSTATES*/ / 32]; | ||
51 | unsigned short s[64 /*NUMSTATES*/ / 16]; | ||
52 | unsigned char c[64 /*NUMSTATES*/ / 8]; | ||
53 | #ifdef _MSC_VER | ||
54 | } decision_t; | ||
55 | #else | ||
56 | } decision_t __attribute__((aligned(16))); | ||
57 | #endif | ||
58 | |||
59 | |||
60 | 131074 | static inline void renormalize(unsigned char* X, unsigned char threshold) | |
61 | { | ||
62 | 131074 | int NUMSTATES = 64; | |
63 | int i; | ||
64 | |||
65 | 131074 | unsigned char min = X[0]; | |
66 | // if(min > threshold) { | ||
67 |
2/2✓ Branch 0 taken 8388736 times.
✓ Branch 1 taken 131074 times.
|
8519810 | for (i = 0; i < NUMSTATES; i++) |
68 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 8388736 times.
|
8388736 | if (min > X[i]) |
69 | ✗ | min = X[i]; | |
70 |
2/2✓ Branch 0 taken 8388736 times.
✓ Branch 1 taken 131074 times.
|
8519810 | for (i = 0; i < NUMSTATES; i++) |
71 | 8388736 | X[i] -= min; | |
72 | //} | ||
73 | 131074 | } | |
74 | |||
75 | |||
76 | // helper BFLY for GENERIC version | ||
77 | 4194304 | static inline void BFLY(int i, | |
78 | int s, | ||
79 | unsigned char* syms, | ||
80 | unsigned char* Y, | ||
81 | unsigned char* X, | ||
82 | decision_t* d, | ||
83 | unsigned char* Branchtab) | ||
84 | { | ||
85 | int j; | ||
86 | unsigned int decision0, decision1; | ||
87 | unsigned char metric, m0, m1, m2, m3; | ||
88 | |||
89 | 4194304 | int NUMSTATES = 64; | |
90 | 4194304 | int RATE = 2; | |
91 | 4194304 | int METRICSHIFT = 2; | |
92 | 4194304 | int PRECISIONSHIFT = 2; | |
93 | |||
94 | 4194304 | metric = 0; | |
95 |
2/2✓ Branch 0 taken 8388608 times.
✓ Branch 1 taken 4194304 times.
|
12582912 | for (j = 0; j < RATE; j++) |
96 | 8388608 | metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; | |
97 | 4194304 | metric = metric >> PRECISIONSHIFT; | |
98 | |||
99 | 4194304 | unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); | |
100 | |||
101 | 4194304 | m0 = X[i] + metric; | |
102 | 4194304 | m1 = X[i + NUMSTATES / 2] + (max - metric); | |
103 | 4194304 | m2 = X[i] + (max - metric); | |
104 | 4194304 | m3 = X[i + NUMSTATES / 2] + metric; | |
105 | |||
106 | 4194304 | decision0 = (signed int)(m0 - m1) > 0; | |
107 | 4194304 | decision1 = (signed int)(m2 - m3) > 0; | |
108 | |||
109 |
2/2✓ Branch 0 taken 655660 times.
✓ Branch 1 taken 3538644 times.
|
4194304 | Y[2 * i] = decision0 ? m1 : m0; |
110 |
2/2✓ Branch 0 taken 1179926 times.
✓ Branch 1 taken 3014378 times.
|
4194304 | Y[2 * i + 1] = decision1 ? m3 : m2; |
111 | |||
112 | 4194304 | d->w[i / (sizeof(unsigned int) * 8 / 2) + | |
113 | 4194304 | s * (sizeof(decision_t) / sizeof(unsigned int))] |= | |
114 | 4194304 | (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1)); | |
115 | 4194304 | } | |
116 | |||
117 | |||
118 | //#if LV_HAVE_AVX2 | ||
119 | // | ||
120 | //#include <immintrin.h> | ||
121 | //#include <stdio.h> | ||
122 | // | ||
123 | // static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, | ||
124 | // unsigned char* X, | ||
125 | // unsigned char* syms, | ||
126 | // unsigned char* dec, | ||
127 | // unsigned int framebits, | ||
128 | // unsigned int excess, | ||
129 | // unsigned char* Branchtab) | ||
130 | //{ | ||
131 | // unsigned int i9; | ||
132 | // for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { | ||
133 | // unsigned char a75, a81; | ||
134 | // int a73, a92; | ||
135 | // int s20, s21; | ||
136 | // unsigned char *a80, *b6; | ||
137 | // int *a110, *a91, *a93; | ||
138 | // __m256i *a112, *a71, *a72, *a77, *a83, *a95; | ||
139 | // __m256i a86, a87; | ||
140 | // __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, | ||
141 | // m26, | ||
142 | // s18, s19, s22, s23, s24, s25, t13, t14, t15; | ||
143 | // a71 = ((__m256i*)X); | ||
144 | // s18 = *(a71); | ||
145 | // a72 = (a71 + 1); | ||
146 | // s19 = *(a72); | ||
147 | // s22 = _mm256_permute2x128_si256(s18, s19, 0x20); | ||
148 | // s19 = _mm256_permute2x128_si256(s18, s19, 0x31); | ||
149 | // s18 = s22; | ||
150 | // a73 = (4 * i9); | ||
151 | // b6 = (syms + a73); | ||
152 | // a75 = *(b6); | ||
153 | // a76 = _mm256_set1_epi8(a75); | ||
154 | // a77 = ((__m256i*)Branchtab); | ||
155 | // a78 = *(a77); | ||
156 | // a79 = _mm256_xor_si256(a76, a78); | ||
157 | // a80 = (b6 + 1); | ||
158 | // a81 = *(a80); | ||
159 | // a82 = _mm256_set1_epi8(a81); | ||
160 | // a83 = (a77 + 1); | ||
161 | // a84 = *(a83); | ||
162 | // a85 = _mm256_xor_si256(a82, a84); | ||
163 | // t13 = _mm256_avg_epu8(a79, a85); | ||
164 | // a86 = ((__m256i)t13); | ||
165 | // a87 = _mm256_srli_epi16(a86, 2); | ||
166 | // a88 = ((__m256i)a87); | ||
167 | // t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); | ||
168 | // t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); | ||
169 | // m23 = _mm256_adds_epu8(s18, t14); | ||
170 | // m24 = _mm256_adds_epu8(s19, t15); | ||
171 | // m25 = _mm256_adds_epu8(s18, t15); | ||
172 | // m26 = _mm256_adds_epu8(s19, t14); | ||
173 | // a89 = _mm256_min_epu8(m24, m23); | ||
174 | // d9 = _mm256_cmpeq_epi8(a89, m24); | ||
175 | // a90 = _mm256_min_epu8(m26, m25); | ||
176 | // d10 = _mm256_cmpeq_epi8(a90, m26); | ||
177 | // s22 = _mm256_unpacklo_epi8(d9, d10); | ||
178 | // s23 = _mm256_unpackhi_epi8(d9, d10); | ||
179 | // s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); | ||
180 | // a91 = ((int*)dec); | ||
181 | // a92 = (4 * i9); | ||
182 | // a93 = (a91 + a92); | ||
183 | // *(a93) = s20; | ||
184 | // s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); | ||
185 | // a110 = (a93 + 1); | ||
186 | // *(a110) = s21; | ||
187 | // s22 = _mm256_unpacklo_epi8(a89, a90); | ||
188 | // s23 = _mm256_unpackhi_epi8(a89, a90); | ||
189 | // a95 = ((__m256i*)Y); | ||
190 | // s24 = _mm256_permute2x128_si256(s22, s23, 0x20); | ||
191 | // *(a95) = s24; | ||
192 | // s23 = _mm256_permute2x128_si256(s22, s23, 0x31); | ||
193 | // a112 = (a95 + 1); | ||
194 | // *(a112) = s23; | ||
195 | // if ((((unsigned char*)Y)[0] > 210)) { | ||
196 | // __m256i m5, m6; | ||
197 | // m5 = ((__m256i*)Y)[0]; | ||
198 | // m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]); | ||
199 | // __m256i m7; | ||
200 | // m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); | ||
201 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), | ||
202 | // ((__m256i)m7))); | ||
203 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), | ||
204 | // ((__m256i)m7))); | ||
205 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), | ||
206 | // ((__m256i)m7))); | ||
207 | // m7 = _mm256_unpacklo_epi8(m7, m7); | ||
208 | // m7 = _mm256_shufflelo_epi16(m7, 0); | ||
209 | // m6 = _mm256_unpacklo_epi64(m7, m7); | ||
210 | // m6 = _mm256_permute2x128_si256( | ||
211 | // m6, m6, 0); // copy lower half of m6 to upper half, since above ops | ||
212 | // // operate on 128 bit lanes | ||
213 | // ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6); | ||
214 | // ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6); | ||
215 | // } | ||
216 | // unsigned char a188, a194; | ||
217 | // int a205; | ||
218 | // int s48, s54; | ||
219 | // unsigned char *a187, *a193; | ||
220 | // int *a204, *a206, *a223, *b16; | ||
221 | // __m256i *a184, *a185, *a190, *a196, *a208, *a225; | ||
222 | // __m256i a199, a200; | ||
223 | // __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, | ||
224 | // m40, | ||
225 | // m41, m42, s46, s47, s50, s51, t25, t26, t27; | ||
226 | // a184 = ((__m256i*)Y); | ||
227 | // s46 = *(a184); | ||
228 | // a185 = (a184 + 1); | ||
229 | // s47 = *(a185); | ||
230 | // s50 = _mm256_permute2x128_si256(s46, s47, 0x20); | ||
231 | // s47 = _mm256_permute2x128_si256(s46, s47, 0x31); | ||
232 | // s46 = s50; | ||
233 | // a187 = (b6 + 2); | ||
234 | // a188 = *(a187); | ||
235 | // a189 = _mm256_set1_epi8(a188); | ||
236 | // a190 = ((__m256i*)Branchtab); | ||
237 | // a191 = *(a190); | ||
238 | // a192 = _mm256_xor_si256(a189, a191); | ||
239 | // a193 = (b6 + 3); | ||
240 | // a194 = *(a193); | ||
241 | // a195 = _mm256_set1_epi8(a194); | ||
242 | // a196 = (a190 + 1); | ||
243 | // a197 = *(a196); | ||
244 | // a198 = _mm256_xor_si256(a195, a197); | ||
245 | // t25 = _mm256_avg_epu8(a192, a198); | ||
246 | // a199 = ((__m256i)t25); | ||
247 | // a200 = _mm256_srli_epi16(a199, 2); | ||
248 | // a201 = ((__m256i)a200); | ||
249 | // t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); | ||
250 | // t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); | ||
251 | // m39 = _mm256_adds_epu8(s46, t26); | ||
252 | // m40 = _mm256_adds_epu8(s47, t27); | ||
253 | // m41 = _mm256_adds_epu8(s46, t27); | ||
254 | // m42 = _mm256_adds_epu8(s47, t26); | ||
255 | // a202 = _mm256_min_epu8(m40, m39); | ||
256 | // d17 = _mm256_cmpeq_epi8(a202, m40); | ||
257 | // a203 = _mm256_min_epu8(m42, m41); | ||
258 | // d18 = _mm256_cmpeq_epi8(a203, m42); | ||
259 | // s24 = _mm256_unpacklo_epi8(d17, d18); | ||
260 | // s25 = _mm256_unpackhi_epi8(d17, d18); | ||
261 | // s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); | ||
262 | // a204 = ((int*)dec); | ||
263 | // a205 = (4 * i9); | ||
264 | // b16 = (a204 + a205); | ||
265 | // a206 = (b16 + 2); | ||
266 | // *(a206) = s48; | ||
267 | // s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); | ||
268 | // a223 = (b16 + 3); | ||
269 | // *(a223) = s54; | ||
270 | // s50 = _mm256_unpacklo_epi8(a202, a203); | ||
271 | // s51 = _mm256_unpackhi_epi8(a202, a203); | ||
272 | // s25 = _mm256_permute2x128_si256(s50, s51, 0x20); | ||
273 | // s51 = _mm256_permute2x128_si256(s50, s51, 0x31); | ||
274 | // a208 = ((__m256i*)X); | ||
275 | // *(a208) = s25; | ||
276 | // a225 = (a208 + 1); | ||
277 | // *(a225) = s51; | ||
278 | // | ||
279 | // if ((((unsigned char*)X)[0] > 210)) { | ||
280 | // __m256i m12, m13; | ||
281 | // m12 = ((__m256i*)X)[0]; | ||
282 | // m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]); | ||
283 | // __m256i m14; | ||
284 | // m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); | ||
285 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), | ||
286 | // ((__m256i)m14))); | ||
287 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), | ||
288 | // ((__m256i)m14))); | ||
289 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), | ||
290 | // ((__m256i)m14))); | ||
291 | // m14 = _mm256_unpacklo_epi8(m14, m14); | ||
292 | // m14 = _mm256_shufflelo_epi16(m14, 0); | ||
293 | // m13 = _mm256_unpacklo_epi64(m14, m14); | ||
294 | // m13 = _mm256_permute2x128_si256(m13, m13, 0); | ||
295 | // ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13); | ||
296 | // ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13); | ||
297 | // } | ||
298 | // } | ||
299 | // | ||
300 | // renormalize(X, 210); | ||
301 | // | ||
302 | // unsigned int j; | ||
303 | // for (j = 0; j < (framebits + excess) % 2; ++j) { | ||
304 | // int i; | ||
305 | // for (i = 0; i < 64 / 2; i++) { | ||
306 | // BFLY(i, | ||
307 | // (((framebits + excess) >> 1) << 1) + j, | ||
308 | // syms, | ||
309 | // Y, | ||
310 | // X, | ||
311 | // (decision_t*)dec, | ||
312 | // Branchtab); | ||
313 | // } | ||
314 | // | ||
315 | // renormalize(Y, 210); | ||
316 | // } | ||
317 | // /*skip*/ | ||
318 | //} | ||
319 | // | ||
320 | //#endif /*LV_HAVE_AVX2*/ | ||
321 | |||
322 | |||
323 | #if LV_HAVE_SSE3 | ||
324 | |||
325 | #include <emmintrin.h> | ||
326 | #include <mmintrin.h> | ||
327 | #include <pmmintrin.h> | ||
328 | #include <stdio.h> | ||
329 | #include <xmmintrin.h> | ||
330 | |||
331 | 2 | static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, | |
332 | unsigned char* X, | ||
333 | unsigned char* syms, | ||
334 | unsigned char* dec, | ||
335 | unsigned int framebits, | ||
336 | unsigned int excess, | ||
337 | unsigned char* Branchtab) | ||
338 | { | ||
339 | unsigned int i9; | ||
340 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { |
341 | unsigned char a75, a81; | ||
342 | int a73, a92; | ||
343 | short int s20, s21, s26, s27; | ||
344 | unsigned char *a74, *a80, *b6; | ||
345 | short int *a110, *a111, *a91, *a93, *a94; | ||
346 | __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; | ||
347 | __m128i a105, a106, a86, a87; | ||
348 | __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, | ||
349 | a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, | ||
350 | s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; | ||
351 | 65534 | a71 = ((__m128i*)X); | |
352 | 65534 | s18 = *(a71); | |
353 | 65534 | a72 = (a71 + 2); | |
354 | 65534 | s19 = *(a72); | |
355 | 65534 | a73 = (4 * i9); | |
356 | 65534 | a74 = (syms + a73); | |
357 | 65534 | a75 = *(a74); | |
358 | 65534 | a76 = _mm_set1_epi8(a75); | |
359 | 65534 | a77 = ((__m128i*)Branchtab); | |
360 | 65534 | a78 = *(a77); | |
361 | 65534 | a79 = _mm_xor_si128(a76, a78); | |
362 | 65534 | b6 = (a73 + syms); | |
363 | 65534 | a80 = (b6 + 1); | |
364 | 65534 | a81 = *(a80); | |
365 | 65534 | a82 = _mm_set1_epi8(a81); | |
366 | 65534 | a83 = (a77 + 2); | |
367 | 65534 | a84 = *(a83); | |
368 | 65534 | a85 = _mm_xor_si128(a82, a84); | |
369 | 65534 | t13 = _mm_avg_epu8(a79, a85); | |
370 | 65534 | a86 = ((__m128i)t13); | |
371 | 65534 | a87 = _mm_srli_epi16(a86, 2); | |
372 | 65534 | a88 = ((__m128i)a87); | |
373 | 131068 | t14 = _mm_and_si128( | |
374 | a88, | ||
375 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
376 | 131068 | t15 = _mm_subs_epu8( | |
377 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
378 | t14); | ||
379 | 65534 | m23 = _mm_adds_epu8(s18, t14); | |
380 | 65534 | m24 = _mm_adds_epu8(s19, t15); | |
381 | 65534 | m25 = _mm_adds_epu8(s18, t15); | |
382 | 65534 | m26 = _mm_adds_epu8(s19, t14); | |
383 | 65534 | a89 = _mm_min_epu8(m24, m23); | |
384 | 65534 | d9 = _mm_cmpeq_epi8(a89, m24); | |
385 | 65534 | a90 = _mm_min_epu8(m26, m25); | |
386 | 65534 | d10 = _mm_cmpeq_epi8(a90, m26); | |
387 | 65534 | s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); | |
388 | 65534 | a91 = ((short int*)dec); | |
389 | 65534 | a92 = (8 * i9); | |
390 | 65534 | a93 = (a91 + a92); | |
391 | 65534 | *(a93) = s20; | |
392 | 65534 | s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); | |
393 | 65534 | a94 = (a93 + 1); | |
394 | 65534 | *(a94) = s21; | |
395 | 65534 | s22 = _mm_unpacklo_epi8(a89, a90); | |
396 | 65534 | s23 = _mm_unpackhi_epi8(a89, a90); | |
397 | 65534 | a95 = ((__m128i*)Y); | |
398 | 65534 | *(a95) = s22; | |
399 | 65534 | a96 = (a95 + 1); | |
400 | 65534 | *(a96) = s23; | |
401 | 65534 | a97 = (a71 + 1); | |
402 | 65534 | s24 = *(a97); | |
403 | 65534 | a98 = (a71 + 3); | |
404 | 65534 | s25 = *(a98); | |
405 | 65534 | a99 = (a77 + 1); | |
406 | 65534 | a100 = *(a99); | |
407 | 65534 | a101 = _mm_xor_si128(a76, a100); | |
408 | 65534 | a102 = (a77 + 3); | |
409 | 65534 | a103 = *(a102); | |
410 | 65534 | a104 = _mm_xor_si128(a82, a103); | |
411 | 65534 | t16 = _mm_avg_epu8(a101, a104); | |
412 | 65534 | a105 = ((__m128i)t16); | |
413 | 65534 | a106 = _mm_srli_epi16(a105, 2); | |
414 | 65534 | a107 = ((__m128i)a106); | |
415 | 131068 | t17 = _mm_and_si128( | |
416 | a107, | ||
417 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
418 | 131068 | t18 = _mm_subs_epu8( | |
419 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
420 | t17); | ||
421 | 65534 | m27 = _mm_adds_epu8(s24, t17); | |
422 | 65534 | m28 = _mm_adds_epu8(s25, t18); | |
423 | 65534 | m29 = _mm_adds_epu8(s24, t18); | |
424 | 65534 | m30 = _mm_adds_epu8(s25, t17); | |
425 | 65534 | a108 = _mm_min_epu8(m28, m27); | |
426 | 65534 | d11 = _mm_cmpeq_epi8(a108, m28); | |
427 | 65534 | a109 = _mm_min_epu8(m30, m29); | |
428 | 65534 | d12 = _mm_cmpeq_epi8(a109, m30); | |
429 | 65534 | s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); | |
430 | 65534 | a110 = (a93 + 2); | |
431 | 65534 | *(a110) = s26; | |
432 | 65534 | s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); | |
433 | 65534 | a111 = (a93 + 3); | |
434 | 65534 | *(a111) = s27; | |
435 | 65534 | s28 = _mm_unpacklo_epi8(a108, a109); | |
436 | 65534 | s29 = _mm_unpackhi_epi8(a108, a109); | |
437 | 65534 | a112 = (a95 + 2); | |
438 | 65534 | *(a112) = s28; | |
439 | 65534 | a113 = (a95 + 3); | |
440 | 65534 | *(a113) = s29; | |
441 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 65534 times.
|
65534 | if ((((unsigned char*)Y)[0] > 210)) { |
442 | __m128i m5, m6; | ||
443 | ✗ | m5 = ((__m128i*)Y)[0]; | |
444 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); | |
445 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); | |
446 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); | |
447 | __m128i m7; | ||
448 | ✗ | m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); | |
449 | m7 = | ||
450 | ✗ | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); | |
451 | m7 = | ||
452 | ✗ | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); | |
453 | ✗ | m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); | |
454 | ✗ | m7 = _mm_unpacklo_epi8(m7, m7); | |
455 | ✗ | m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); | |
456 | ✗ | m6 = _mm_unpacklo_epi64(m7, m7); | |
457 | ✗ | ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); | |
458 | ✗ | ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); | |
459 | ✗ | ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); | |
460 | ✗ | ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); | |
461 | } | ||
462 | unsigned char a188, a194; | ||
463 | int a186, a205; | ||
464 | short int s48, s49, s54, s55; | ||
465 | unsigned char *a187, *a193, *b15; | ||
466 | short int *a204, *a206, *a207, *a223, *a224, *b16; | ||
467 | __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, | ||
468 | *a225, *a226; | ||
469 | __m128i a199, a200, a218, a219; | ||
470 | __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, | ||
471 | a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, | ||
472 | m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; | ||
473 | 65534 | a184 = ((__m128i*)Y); | |
474 | 65534 | s46 = *(a184); | |
475 | 65534 | a185 = (a184 + 2); | |
476 | 65534 | s47 = *(a185); | |
477 | 65534 | a186 = (4 * i9); | |
478 | 65534 | b15 = (a186 + syms); | |
479 | 65534 | a187 = (b15 + 2); | |
480 | 65534 | a188 = *(a187); | |
481 | 65534 | a189 = _mm_set1_epi8(a188); | |
482 | 65534 | a190 = ((__m128i*)Branchtab); | |
483 | 65534 | a191 = *(a190); | |
484 | 65534 | a192 = _mm_xor_si128(a189, a191); | |
485 | 65534 | a193 = (b15 + 3); | |
486 | 65534 | a194 = *(a193); | |
487 | 65534 | a195 = _mm_set1_epi8(a194); | |
488 | 65534 | a196 = (a190 + 2); | |
489 | 65534 | a197 = *(a196); | |
490 | 65534 | a198 = _mm_xor_si128(a195, a197); | |
491 | 65534 | t25 = _mm_avg_epu8(a192, a198); | |
492 | 65534 | a199 = ((__m128i)t25); | |
493 | 65534 | a200 = _mm_srli_epi16(a199, 2); | |
494 | 65534 | a201 = ((__m128i)a200); | |
495 | 131068 | t26 = _mm_and_si128( | |
496 | a201, | ||
497 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
498 | 131068 | t27 = _mm_subs_epu8( | |
499 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
500 | t26); | ||
501 | 65534 | m39 = _mm_adds_epu8(s46, t26); | |
502 | 65534 | m40 = _mm_adds_epu8(s47, t27); | |
503 | 65534 | m41 = _mm_adds_epu8(s46, t27); | |
504 | 65534 | m42 = _mm_adds_epu8(s47, t26); | |
505 | 65534 | a202 = _mm_min_epu8(m40, m39); | |
506 | 65534 | d17 = _mm_cmpeq_epi8(a202, m40); | |
507 | 65534 | a203 = _mm_min_epu8(m42, m41); | |
508 | 65534 | d18 = _mm_cmpeq_epi8(a203, m42); | |
509 | 65534 | s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); | |
510 | 65534 | a204 = ((short int*)dec); | |
511 | 65534 | a205 = (8 * i9); | |
512 | 65534 | b16 = (a204 + a205); | |
513 | 65534 | a206 = (b16 + 4); | |
514 | 65534 | *(a206) = s48; | |
515 | 65534 | s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); | |
516 | 65534 | a207 = (b16 + 5); | |
517 | 65534 | *(a207) = s49; | |
518 | 65534 | s50 = _mm_unpacklo_epi8(a202, a203); | |
519 | 65534 | s51 = _mm_unpackhi_epi8(a202, a203); | |
520 | 65534 | a208 = ((__m128i*)X); | |
521 | 65534 | *(a208) = s50; | |
522 | 65534 | a209 = (a208 + 1); | |
523 | 65534 | *(a209) = s51; | |
524 | 65534 | a210 = (a184 + 1); | |
525 | 65534 | s52 = *(a210); | |
526 | 65534 | a211 = (a184 + 3); | |
527 | 65534 | s53 = *(a211); | |
528 | 65534 | a212 = (a190 + 1); | |
529 | 65534 | a213 = *(a212); | |
530 | 65534 | a214 = _mm_xor_si128(a189, a213); | |
531 | 65534 | a215 = (a190 + 3); | |
532 | 65534 | a216 = *(a215); | |
533 | 65534 | a217 = _mm_xor_si128(a195, a216); | |
534 | 65534 | t28 = _mm_avg_epu8(a214, a217); | |
535 | 65534 | a218 = ((__m128i)t28); | |
536 | 65534 | a219 = _mm_srli_epi16(a218, 2); | |
537 | 65534 | a220 = ((__m128i)a219); | |
538 | 131068 | t29 = _mm_and_si128( | |
539 | a220, | ||
540 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
541 | 131068 | t30 = _mm_subs_epu8( | |
542 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
543 | t29); | ||
544 | 65534 | m43 = _mm_adds_epu8(s52, t29); | |
545 | 65534 | m44 = _mm_adds_epu8(s53, t30); | |
546 | 65534 | m45 = _mm_adds_epu8(s52, t30); | |
547 | 65534 | m46 = _mm_adds_epu8(s53, t29); | |
548 | 65534 | a221 = _mm_min_epu8(m44, m43); | |
549 | 65534 | d19 = _mm_cmpeq_epi8(a221, m44); | |
550 | 65534 | a222 = _mm_min_epu8(m46, m45); | |
551 | 65534 | d20 = _mm_cmpeq_epi8(a222, m46); | |
552 | 65534 | s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); | |
553 | 65534 | a223 = (b16 + 6); | |
554 | 65534 | *(a223) = s54; | |
555 | 65534 | s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); | |
556 | 65534 | a224 = (b16 + 7); | |
557 | 65534 | *(a224) = s55; | |
558 | 65534 | s56 = _mm_unpacklo_epi8(a221, a222); | |
559 | 65534 | s57 = _mm_unpackhi_epi8(a221, a222); | |
560 | 65534 | a225 = (a208 + 2); | |
561 | 65534 | *(a225) = s56; | |
562 | 65534 | a226 = (a208 + 3); | |
563 | 65534 | *(a226) = s57; | |
564 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 65534 times.
|
65534 | if ((((unsigned char*)X)[0] > 210)) { |
565 | __m128i m12, m13; | ||
566 | ✗ | m12 = ((__m128i*)X)[0]; | |
567 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); | |
568 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); | |
569 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); | |
570 | __m128i m14; | ||
571 | ✗ | m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); | |
572 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), | |
573 | ((__m128i)m14))); | ||
574 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), | |
575 | ((__m128i)m14))); | ||
576 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), | |
577 | ((__m128i)m14))); | ||
578 | ✗ | m14 = _mm_unpacklo_epi8(m14, m14); | |
579 | ✗ | m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); | |
580 | ✗ | m13 = _mm_unpacklo_epi64(m14, m14); | |
581 | ✗ | ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); | |
582 | ✗ | ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); | |
583 | ✗ | ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); | |
584 | ✗ | ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); | |
585 | } | ||
586 | } | ||
587 | |||
588 | 2 | renormalize(X, 210); | |
589 | |||
590 | /*int ch; | ||
591 | for(ch = 0; ch < 64; ch++) { | ||
592 | printf("%d,", X[ch]); | ||
593 | } | ||
594 | printf("\n");*/ | ||
595 | |||
596 | unsigned int j; | ||
597 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | for (j = 0; j < (framebits + excess) % 2; ++j) { |
598 | int i; | ||
599 |
2/2✓ Branch 0 taken 64 times.
✓ Branch 1 taken 2 times.
|
66 | for (i = 0; i < 64 / 2; i++) { |
600 | 64 | BFLY(i, | |
601 | 64 | (((framebits + excess) >> 1) << 1) + j, | |
602 | syms, | ||
603 | Y, | ||
604 | X, | ||
605 | (decision_t*)dec, | ||
606 | Branchtab); | ||
607 | } | ||
608 | |||
609 | |||
610 | 2 | renormalize(Y, 210); | |
611 | |||
612 | /*printf("\n"); | ||
613 | for(ch = 0; ch < 64; ch++) { | ||
614 | printf("%d,", Y[ch]); | ||
615 | } | ||
616 | printf("\n");*/ | ||
617 | } | ||
618 | /*skip*/ | ||
619 | 2 | } | |
620 | |||
621 | #endif /*LV_HAVE_SSE3*/ | ||
622 | |||
623 | #if LV_HAVE_NEON | ||
624 | |||
625 | #include "volk/sse2neon.h" | ||
626 | |||
627 | static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, | ||
628 | unsigned char* X, | ||
629 | unsigned char* syms, | ||
630 | unsigned char* dec, | ||
631 | unsigned int framebits, | ||
632 | unsigned int excess, | ||
633 | unsigned char* Branchtab) | ||
634 | { | ||
635 | unsigned int i9; | ||
636 | for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { | ||
637 | unsigned char a75, a81; | ||
638 | int a73, a92; | ||
639 | short int s20, s21, s26, s27; | ||
640 | unsigned char *a74, *a80, *b6; | ||
641 | short int *a110, *a111, *a91, *a93, *a94; | ||
642 | __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; | ||
643 | __m128i a105, a106, a86, a87; | ||
644 | __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, | ||
645 | a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, | ||
646 | s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; | ||
647 | a71 = ((__m128i*)X); | ||
648 | s18 = *(a71); | ||
649 | a72 = (a71 + 2); | ||
650 | s19 = *(a72); | ||
651 | a73 = (4 * i9); | ||
652 | a74 = (syms + a73); | ||
653 | a75 = *(a74); | ||
654 | a76 = _mm_set1_epi8(a75); | ||
655 | a77 = ((__m128i*)Branchtab); | ||
656 | a78 = *(a77); | ||
657 | a79 = _mm_xor_si128(a76, a78); | ||
658 | b6 = (a73 + syms); | ||
659 | a80 = (b6 + 1); | ||
660 | a81 = *(a80); | ||
661 | a82 = _mm_set1_epi8(a81); | ||
662 | a83 = (a77 + 2); | ||
663 | a84 = *(a83); | ||
664 | a85 = _mm_xor_si128(a82, a84); | ||
665 | t13 = _mm_avg_epu8(a79, a85); | ||
666 | a86 = ((__m128i)t13); | ||
667 | a87 = _mm_srli_epi16(a86, 2); | ||
668 | a88 = ((__m128i)a87); | ||
669 | t14 = _mm_and_si128( | ||
670 | a88, | ||
671 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
672 | t15 = _mm_subs_epu8( | ||
673 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
674 | t14); | ||
675 | m23 = _mm_adds_epu8(s18, t14); | ||
676 | m24 = _mm_adds_epu8(s19, t15); | ||
677 | m25 = _mm_adds_epu8(s18, t15); | ||
678 | m26 = _mm_adds_epu8(s19, t14); | ||
679 | a89 = _mm_min_epu8(m24, m23); | ||
680 | d9 = _mm_cmpeq_epi8(a89, m24); | ||
681 | a90 = _mm_min_epu8(m26, m25); | ||
682 | d10 = _mm_cmpeq_epi8(a90, m26); | ||
683 | s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); | ||
684 | a91 = ((short int*)dec); | ||
685 | a92 = (8 * i9); | ||
686 | a93 = (a91 + a92); | ||
687 | *(a93) = s20; | ||
688 | s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); | ||
689 | a94 = (a93 + 1); | ||
690 | *(a94) = s21; | ||
691 | s22 = _mm_unpacklo_epi8(a89, a90); | ||
692 | s23 = _mm_unpackhi_epi8(a89, a90); | ||
693 | a95 = ((__m128i*)Y); | ||
694 | *(a95) = s22; | ||
695 | a96 = (a95 + 1); | ||
696 | *(a96) = s23; | ||
697 | a97 = (a71 + 1); | ||
698 | s24 = *(a97); | ||
699 | a98 = (a71 + 3); | ||
700 | s25 = *(a98); | ||
701 | a99 = (a77 + 1); | ||
702 | a100 = *(a99); | ||
703 | a101 = _mm_xor_si128(a76, a100); | ||
704 | a102 = (a77 + 3); | ||
705 | a103 = *(a102); | ||
706 | a104 = _mm_xor_si128(a82, a103); | ||
707 | t16 = _mm_avg_epu8(a101, a104); | ||
708 | a105 = ((__m128i)t16); | ||
709 | a106 = _mm_srli_epi16(a105, 2); | ||
710 | a107 = ((__m128i)a106); | ||
711 | t17 = _mm_and_si128( | ||
712 | a107, | ||
713 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
714 | t18 = _mm_subs_epu8( | ||
715 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
716 | t17); | ||
717 | m27 = _mm_adds_epu8(s24, t17); | ||
718 | m28 = _mm_adds_epu8(s25, t18); | ||
719 | m29 = _mm_adds_epu8(s24, t18); | ||
720 | m30 = _mm_adds_epu8(s25, t17); | ||
721 | a108 = _mm_min_epu8(m28, m27); | ||
722 | d11 = _mm_cmpeq_epi8(a108, m28); | ||
723 | a109 = _mm_min_epu8(m30, m29); | ||
724 | d12 = _mm_cmpeq_epi8(a109, m30); | ||
725 | s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); | ||
726 | a110 = (a93 + 2); | ||
727 | *(a110) = s26; | ||
728 | s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); | ||
729 | a111 = (a93 + 3); | ||
730 | *(a111) = s27; | ||
731 | s28 = _mm_unpacklo_epi8(a108, a109); | ||
732 | s29 = _mm_unpackhi_epi8(a108, a109); | ||
733 | a112 = (a95 + 2); | ||
734 | *(a112) = s28; | ||
735 | a113 = (a95 + 3); | ||
736 | *(a113) = s29; | ||
737 | if ((((unsigned char*)Y)[0] > 210)) { | ||
738 | __m128i m5, m6; | ||
739 | m5 = ((__m128i*)Y)[0]; | ||
740 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); | ||
741 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); | ||
742 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); | ||
743 | __m128i m7; | ||
744 | m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); | ||
745 | m7 = | ||
746 | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); | ||
747 | m7 = | ||
748 | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); | ||
749 | m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); | ||
750 | m7 = _mm_unpacklo_epi8(m7, m7); | ||
751 | m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); | ||
752 | m6 = _mm_unpacklo_epi64(m7, m7); | ||
753 | ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); | ||
754 | ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); | ||
755 | ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); | ||
756 | ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); | ||
757 | } | ||
758 | unsigned char a188, a194; | ||
759 | int a186, a205; | ||
760 | short int s48, s49, s54, s55; | ||
761 | unsigned char *a187, *a193, *b15; | ||
762 | short int *a204, *a206, *a207, *a223, *a224, *b16; | ||
763 | __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, | ||
764 | *a225, *a226; | ||
765 | __m128i a199, a200, a218, a219; | ||
766 | __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, | ||
767 | a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, | ||
768 | m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; | ||
769 | a184 = ((__m128i*)Y); | ||
770 | s46 = *(a184); | ||
771 | a185 = (a184 + 2); | ||
772 | s47 = *(a185); | ||
773 | a186 = (4 * i9); | ||
774 | b15 = (a186 + syms); | ||
775 | a187 = (b15 + 2); | ||
776 | a188 = *(a187); | ||
777 | a189 = _mm_set1_epi8(a188); | ||
778 | a190 = ((__m128i*)Branchtab); | ||
779 | a191 = *(a190); | ||
780 | a192 = _mm_xor_si128(a189, a191); | ||
781 | a193 = (b15 + 3); | ||
782 | a194 = *(a193); | ||
783 | a195 = _mm_set1_epi8(a194); | ||
784 | a196 = (a190 + 2); | ||
785 | a197 = *(a196); | ||
786 | a198 = _mm_xor_si128(a195, a197); | ||
787 | t25 = _mm_avg_epu8(a192, a198); | ||
788 | a199 = ((__m128i)t25); | ||
789 | a200 = _mm_srli_epi16(a199, 2); | ||
790 | a201 = ((__m128i)a200); | ||
791 | t26 = _mm_and_si128( | ||
792 | a201, | ||
793 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
794 | t27 = _mm_subs_epu8( | ||
795 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
796 | t26); | ||
797 | m39 = _mm_adds_epu8(s46, t26); | ||
798 | m40 = _mm_adds_epu8(s47, t27); | ||
799 | m41 = _mm_adds_epu8(s46, t27); | ||
800 | m42 = _mm_adds_epu8(s47, t26); | ||
801 | a202 = _mm_min_epu8(m40, m39); | ||
802 | d17 = _mm_cmpeq_epi8(a202, m40); | ||
803 | a203 = _mm_min_epu8(m42, m41); | ||
804 | d18 = _mm_cmpeq_epi8(a203, m42); | ||
805 | s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); | ||
806 | a204 = ((short int*)dec); | ||
807 | a205 = (8 * i9); | ||
808 | b16 = (a204 + a205); | ||
809 | a206 = (b16 + 4); | ||
810 | *(a206) = s48; | ||
811 | s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); | ||
812 | a207 = (b16 + 5); | ||
813 | *(a207) = s49; | ||
814 | s50 = _mm_unpacklo_epi8(a202, a203); | ||
815 | s51 = _mm_unpackhi_epi8(a202, a203); | ||
816 | a208 = ((__m128i*)X); | ||
817 | *(a208) = s50; | ||
818 | a209 = (a208 + 1); | ||
819 | *(a209) = s51; | ||
820 | a210 = (a184 + 1); | ||
821 | s52 = *(a210); | ||
822 | a211 = (a184 + 3); | ||
823 | s53 = *(a211); | ||
824 | a212 = (a190 + 1); | ||
825 | a213 = *(a212); | ||
826 | a214 = _mm_xor_si128(a189, a213); | ||
827 | a215 = (a190 + 3); | ||
828 | a216 = *(a215); | ||
829 | a217 = _mm_xor_si128(a195, a216); | ||
830 | t28 = _mm_avg_epu8(a214, a217); | ||
831 | a218 = ((__m128i)t28); | ||
832 | a219 = _mm_srli_epi16(a218, 2); | ||
833 | a220 = ((__m128i)a219); | ||
834 | t29 = _mm_and_si128( | ||
835 | a220, | ||
836 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
837 | t30 = _mm_subs_epu8( | ||
838 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
839 | t29); | ||
840 | m43 = _mm_adds_epu8(s52, t29); | ||
841 | m44 = _mm_adds_epu8(s53, t30); | ||
842 | m45 = _mm_adds_epu8(s52, t30); | ||
843 | m46 = _mm_adds_epu8(s53, t29); | ||
844 | a221 = _mm_min_epu8(m44, m43); | ||
845 | d19 = _mm_cmpeq_epi8(a221, m44); | ||
846 | a222 = _mm_min_epu8(m46, m45); | ||
847 | d20 = _mm_cmpeq_epi8(a222, m46); | ||
848 | s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); | ||
849 | a223 = (b16 + 6); | ||
850 | *(a223) = s54; | ||
851 | s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); | ||
852 | a224 = (b16 + 7); | ||
853 | *(a224) = s55; | ||
854 | s56 = _mm_unpacklo_epi8(a221, a222); | ||
855 | s57 = _mm_unpackhi_epi8(a221, a222); | ||
856 | a225 = (a208 + 2); | ||
857 | *(a225) = s56; | ||
858 | a226 = (a208 + 3); | ||
859 | *(a226) = s57; | ||
860 | if ((((unsigned char*)X)[0] > 210)) { | ||
861 | __m128i m12, m13; | ||
862 | m12 = ((__m128i*)X)[0]; | ||
863 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); | ||
864 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); | ||
865 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); | ||
866 | __m128i m14; | ||
867 | m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); | ||
868 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), | ||
869 | ((__m128i)m14))); | ||
870 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), | ||
871 | ((__m128i)m14))); | ||
872 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), | ||
873 | ((__m128i)m14))); | ||
874 | m14 = _mm_unpacklo_epi8(m14, m14); | ||
875 | m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); | ||
876 | m13 = _mm_unpacklo_epi64(m14, m14); | ||
877 | ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); | ||
878 | ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); | ||
879 | ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); | ||
880 | ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); | ||
881 | } | ||
882 | } | ||
883 | |||
884 | renormalize(X, 210); | ||
885 | |||
886 | /*int ch; | ||
887 | for(ch = 0; ch < 64; ch++) { | ||
888 | printf("%d,", X[ch]); | ||
889 | } | ||
890 | printf("\n");*/ | ||
891 | |||
892 | unsigned int j; | ||
893 | for (j = 0; j < (framebits + excess) % 2; ++j) { | ||
894 | int i; | ||
895 | for (i = 0; i < 64 / 2; i++) { | ||
896 | BFLY(i, | ||
897 | (((framebits + excess) >> 1) << 1) + j, | ||
898 | syms, | ||
899 | Y, | ||
900 | X, | ||
901 | (decision_t*)dec, | ||
902 | Branchtab); | ||
903 | } | ||
904 | |||
905 | |||
906 | renormalize(Y, 210); | ||
907 | |||
908 | /*printf("\n"); | ||
909 | for(ch = 0; ch < 64; ch++) { | ||
910 | printf("%d,", Y[ch]); | ||
911 | } | ||
912 | printf("\n");*/ | ||
913 | } | ||
914 | /*skip*/ | ||
915 | } | ||
916 | |||
917 | #endif /*LV_HAVE_NEON*/ | ||
918 | |||
919 | #if LV_HAVE_GENERIC | ||
920 | |||
921 | 2 | static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, | |
922 | unsigned char* X, | ||
923 | unsigned char* syms, | ||
924 | unsigned char* dec, | ||
925 | unsigned int framebits, | ||
926 | unsigned int excess, | ||
927 | unsigned char* Branchtab) | ||
928 | { | ||
929 | 2 | int nbits = framebits + excess; | |
930 | 2 | int NUMSTATES = 64; | |
931 | 2 | int RENORMALIZE_THRESHOLD = 210; | |
932 | |||
933 | int s, i; | ||
934 |
2/2✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
|
131072 | for (s = 0; s < nbits; s++) { |
935 | void* tmp; | ||
936 |
2/2✓ Branch 0 taken 4194240 times.
✓ Branch 1 taken 131070 times.
|
4325310 | for (i = 0; i < NUMSTATES / 2; i++) { |
937 | 4194240 | BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab); | |
938 | } | ||
939 | |||
940 | 131070 | renormalize(Y, RENORMALIZE_THRESHOLD); | |
941 | |||
942 | /// Swap pointers to old and new metrics | ||
943 | 131070 | tmp = (void*)X; | |
944 | 131070 | X = Y; | |
945 | 131070 | Y = (unsigned char*)tmp; | |
946 | } | ||
947 | 2 | } | |
948 | |||
949 | #endif /* LV_HAVE_GENERIC */ | ||
950 | |||
951 | #endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/ | ||
952 |