GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 218 249 87.6%
Functions: 4 4 100.0%
Branches: 23 26 88.5%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_8u_x4_conv_k7_r2_8u
12 *
13 * \b Overview
14 *
15 * Performs convolutional decoding for a K=7, rate 1/2 convolutional
* code. The polynomials are user defined.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms,
21 *     unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab)
22 * \endcode
23 *
24 * \b Inputs
25 * \li X: array of 64 path metrics carried into the decode (one per state).
26 * \li syms: input symbols to decode, two per bit for the rate 1/2 code.
27 * \li dec: buffer for the traceback decision bits, 8 bytes per decoded bit.
28 * \li framebits: size of the frame to decode in bits.
29 * \li excess: number of bits to process beyond framebits (e.g. flush bits).
30 * \li Branchtab: 64-entry branch-metric table derived from the polynomials.
31 *
32 * \b Outputs
33 * \li Y: working buffer receiving the updated path metrics (the traceback decisions land in dec).
34 *
35 * \b Example
36 * \code
37 * // Sketch: syms, dec, Branchtab, framebits and excess set up as described above.
38 * unsigned char* X = (unsigned char*)volk_malloc(64, volk_get_alignment());
39 * unsigned char* Y = (unsigned char*)volk_malloc(64, volk_get_alignment());
40 * volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab);
41 * volk_free(X); volk_free(Y);
42 * \endcode
43 */
44
45 #ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
46 #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
47
48 typedef union {
49 unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
50 unsigned int w[64 /*NUMSTATES*/ / 32];
51 unsigned short s[64 /*NUMSTATES*/ / 16];
52 unsigned char c[64 /*NUMSTATES*/ / 8];
53 #ifdef _MSC_VER
54 } decision_t;
55 #else
56 } decision_t __attribute__((aligned(16)));
57 #endif
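// Note: decision_t packs one traceback decision bit per state, 64 bits per
// trellis step; the union overlays 8-, 16-, and 32-bit views of the same
// 8 bytes, kept 16-byte aligned so SIMD stores can write it directly.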
58
59
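// renormalize() keeps the unsigned 8-bit path metrics from saturating: it
// subtracts the current minimum metric from all 64 states, which leaves the
// add-compare-select decisions unchanged since only metric differences matter.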
60 131074 static inline void renormalize(unsigned char* X, unsigned char threshold)
61 {
62 131074 int NUMSTATES = 64;
63 int i;
64
65 131074 unsigned char min = X[0];
66 // if(min > threshold) {
2/2
✓ Branch 0 taken 8388736 times.
✓ Branch 1 taken 131074 times.
67 8519810 for (i = 0; i < NUMSTATES; i++)
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8388736 times.
68 8388736 if (min > X[i])
69 min = X[i];
2/2
✓ Branch 0 taken 8388736 times.
✓ Branch 1 taken 131074 times.
70 8519810 for (i = 0; i < NUMSTATES; i++)
71 8388736 X[i] -= min;
72 //}
73 131074 }
74
75
76 // helper BFLY for GENERIC version
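// One add-compare-select butterfly: the branch metric from Branchtab is added
// to the two predecessor metrics of states 2i and 2i+1, the smaller candidate
// survives into Y, and the winning-path bits are packed into d for traceback.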
77 4194304 static inline void BFLY(int i,
78 int s,
79 unsigned char* syms,
80 unsigned char* Y,
81 unsigned char* X,
82 decision_t* d,
83 unsigned char* Branchtab)
84 {
85 int j;
86 unsigned int decision0, decision1;
87 unsigned char metric, m0, m1, m2, m3;
88
89 4194304 int NUMSTATES = 64;
90 4194304 int RATE = 2;
91 4194304 int METRICSHIFT = 2;
92 4194304 int PRECISIONSHIFT = 2;
93
94 4194304 metric = 0;
2/2
✓ Branch 0 taken 8388608 times.
✓ Branch 1 taken 4194304 times.
95 12582912 for (j = 0; j < RATE; j++)
96 8388608 metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
97 4194304 metric = metric >> PRECISIONSHIFT;
98
99 4194304 unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
100
101 4194304 m0 = X[i] + metric;
102 4194304 m1 = X[i + NUMSTATES / 2] + (max - metric);
103 4194304 m2 = X[i] + (max - metric);
104 4194304 m3 = X[i + NUMSTATES / 2] + metric;
105
106 4194304 decision0 = (signed int)(m0 - m1) > 0;
107 4194304 decision1 = (signed int)(m2 - m3) > 0;
108
2/2
✓ Branch 0 taken 655660 times.
✓ Branch 1 taken 3538644 times.
109 4194304 Y[2 * i] = decision0 ? m1 : m0;
2/2
✓ Branch 0 taken 1179926 times.
✓ Branch 1 taken 3014378 times.
110 4194304 Y[2 * i + 1] = decision1 ? m3 : m2;
111
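// Pack decision0/decision1 as two adjacent bits at bit offset 2i within the
// 64-bit decision record for trellis step s.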
112 4194304 d->w[i / (sizeof(unsigned int) * 8 / 2) +
113 4194304 s * (sizeof(decision_t) / sizeof(unsigned int))] |=
114 4194304 (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
115 4194304 }
116
117
118 //#if LV_HAVE_AVX2
119 //
120 //#include <immintrin.h>
121 //#include <stdio.h>
122 //
123 // static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
124 // unsigned char* X,
125 // unsigned char* syms,
126 // unsigned char* dec,
127 // unsigned int framebits,
128 // unsigned int excess,
129 // unsigned char* Branchtab)
130 //{
131 // unsigned int i9;
132 // for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
133 // unsigned char a75, a81;
134 // int a73, a92;
135 // int s20, s21;
136 // unsigned char *a80, *b6;
137 // int *a110, *a91, *a93;
138 // __m256i *a112, *a71, *a72, *a77, *a83, *a95;
139 // __m256i a86, a87;
140 // __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
141 // m26,
142 // s18, s19, s22, s23, s24, s25, t13, t14, t15;
143 // a71 = ((__m256i*)X);
144 // s18 = *(a71);
145 // a72 = (a71 + 1);
146 // s19 = *(a72);
147 // s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
148 // s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
149 // s18 = s22;
150 // a73 = (4 * i9);
151 // b6 = (syms + a73);
152 // a75 = *(b6);
153 // a76 = _mm256_set1_epi8(a75);
154 // a77 = ((__m256i*)Branchtab);
155 // a78 = *(a77);
156 // a79 = _mm256_xor_si256(a76, a78);
157 // a80 = (b6 + 1);
158 // a81 = *(a80);
159 // a82 = _mm256_set1_epi8(a81);
160 // a83 = (a77 + 1);
161 // a84 = *(a83);
162 // a85 = _mm256_xor_si256(a82, a84);
163 // t13 = _mm256_avg_epu8(a79, a85);
164 // a86 = ((__m256i)t13);
165 // a87 = _mm256_srli_epi16(a86, 2);
166 // a88 = ((__m256i)a87);
167 // t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
168 // t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
169 // m23 = _mm256_adds_epu8(s18, t14);
170 // m24 = _mm256_adds_epu8(s19, t15);
171 // m25 = _mm256_adds_epu8(s18, t15);
172 // m26 = _mm256_adds_epu8(s19, t14);
173 // a89 = _mm256_min_epu8(m24, m23);
174 // d9 = _mm256_cmpeq_epi8(a89, m24);
175 // a90 = _mm256_min_epu8(m26, m25);
176 // d10 = _mm256_cmpeq_epi8(a90, m26);
177 // s22 = _mm256_unpacklo_epi8(d9, d10);
178 // s23 = _mm256_unpackhi_epi8(d9, d10);
179 // s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
180 // a91 = ((int*)dec);
181 // a92 = (4 * i9);
182 // a93 = (a91 + a92);
183 // *(a93) = s20;
184 // s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
185 // a110 = (a93 + 1);
186 // *(a110) = s21;
187 // s22 = _mm256_unpacklo_epi8(a89, a90);
188 // s23 = _mm256_unpackhi_epi8(a89, a90);
189 // a95 = ((__m256i*)Y);
190 // s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
191 // *(a95) = s24;
192 // s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
193 // a112 = (a95 + 1);
194 // *(a112) = s23;
195 // if ((((unsigned char*)Y)[0] > 210)) {
196 // __m256i m5, m6;
197 // m5 = ((__m256i*)Y)[0];
198 // m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
199 // __m256i m7;
200 // m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
201 // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
202 // ((__m256i)m7)));
203 // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
204 // ((__m256i)m7)));
205 // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
206 // ((__m256i)m7)));
207 // m7 = _mm256_unpacklo_epi8(m7, m7);
208 // m7 = _mm256_shufflelo_epi16(m7, 0);
209 // m6 = _mm256_unpacklo_epi64(m7, m7);
210 // m6 = _mm256_permute2x128_si256(
211 // m6, m6, 0); // copy lower half of m6 to upper half, since above ops
212 // // operate on 128 bit lanes
213 // ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
214 // ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
215 // }
216 // unsigned char a188, a194;
217 // int a205;
218 // int s48, s54;
219 // unsigned char *a187, *a193;
220 // int *a204, *a206, *a223, *b16;
221 // __m256i *a184, *a185, *a190, *a196, *a208, *a225;
222 // __m256i a199, a200;
223 // __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
224 // m40,
225 // m41, m42, s46, s47, s50, s51, t25, t26, t27;
226 // a184 = ((__m256i*)Y);
227 // s46 = *(a184);
228 // a185 = (a184 + 1);
229 // s47 = *(a185);
230 // s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
231 // s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
232 // s46 = s50;
233 // a187 = (b6 + 2);
234 // a188 = *(a187);
235 // a189 = _mm256_set1_epi8(a188);
236 // a190 = ((__m256i*)Branchtab);
237 // a191 = *(a190);
238 // a192 = _mm256_xor_si256(a189, a191);
239 // a193 = (b6 + 3);
240 // a194 = *(a193);
241 // a195 = _mm256_set1_epi8(a194);
242 // a196 = (a190 + 1);
243 // a197 = *(a196);
244 // a198 = _mm256_xor_si256(a195, a197);
245 // t25 = _mm256_avg_epu8(a192, a198);
246 // a199 = ((__m256i)t25);
247 // a200 = _mm256_srli_epi16(a199, 2);
248 // a201 = ((__m256i)a200);
249 // t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
250 // t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
251 // m39 = _mm256_adds_epu8(s46, t26);
252 // m40 = _mm256_adds_epu8(s47, t27);
253 // m41 = _mm256_adds_epu8(s46, t27);
254 // m42 = _mm256_adds_epu8(s47, t26);
255 // a202 = _mm256_min_epu8(m40, m39);
256 // d17 = _mm256_cmpeq_epi8(a202, m40);
257 // a203 = _mm256_min_epu8(m42, m41);
258 // d18 = _mm256_cmpeq_epi8(a203, m42);
259 // s24 = _mm256_unpacklo_epi8(d17, d18);
260 // s25 = _mm256_unpackhi_epi8(d17, d18);
261 // s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
262 // a204 = ((int*)dec);
263 // a205 = (4 * i9);
264 // b16 = (a204 + a205);
265 // a206 = (b16 + 2);
266 // *(a206) = s48;
267 // s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
268 // a223 = (b16 + 3);
269 // *(a223) = s54;
270 // s50 = _mm256_unpacklo_epi8(a202, a203);
271 // s51 = _mm256_unpackhi_epi8(a202, a203);
272 // s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
273 // s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
274 // a208 = ((__m256i*)X);
275 // *(a208) = s25;
276 // a225 = (a208 + 1);
277 // *(a225) = s51;
278 //
279 // if ((((unsigned char*)X)[0] > 210)) {
280 // __m256i m12, m13;
281 // m12 = ((__m256i*)X)[0];
282 // m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
283 // __m256i m14;
284 // m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
285 // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
286 // ((__m256i)m14)));
287 // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
288 // ((__m256i)m14)));
289 // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
290 // ((__m256i)m14)));
291 // m14 = _mm256_unpacklo_epi8(m14, m14);
292 // m14 = _mm256_shufflelo_epi16(m14, 0);
293 // m13 = _mm256_unpacklo_epi64(m14, m14);
294 // m13 = _mm256_permute2x128_si256(m13, m13, 0);
295 // ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
296 // ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
297 // }
298 // }
299 //
300 // renormalize(X, 210);
301 //
302 // unsigned int j;
303 // for (j = 0; j < (framebits + excess) % 2; ++j) {
304 // int i;
305 // for (i = 0; i < 64 / 2; i++) {
306 // BFLY(i,
307 // (((framebits + excess) >> 1) << 1) + j,
308 // syms,
309 // Y,
310 // X,
311 // (decision_t*)dec,
312 // Branchtab);
313 // }
314 //
315 // renormalize(Y, 210);
316 // }
317 // /*skip*/
318 //}
319 //
320 //#endif /*LV_HAVE_AVX2*/
321
322
323 #if LV_HAVE_SSE3
324
325 #include <emmintrin.h>
326 #include <mmintrin.h>
327 #include <pmmintrin.h>
328 #include <stdio.h>
329 #include <xmmintrin.h>
330
331 2 static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
332 unsigned char* X,
333 unsigned char* syms,
334 unsigned char* dec,
335 unsigned int framebits,
336 unsigned int excess,
337 unsigned char* Branchtab)
338 {
339 unsigned int i9;
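// Each loop iteration below consumes four input symbols and advances the
// trellis two steps, emitting two 8-byte decision records into dec.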
2/2
✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
340 65536 for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
341 unsigned char a75, a81;
342 int a73, a92;
343 short int s20, s21, s26, s27;
344 unsigned char *a74, *a80, *b6;
345 short int *a110, *a111, *a91, *a93, *a94;
346 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
347 __m128i a105, a106, a86, a87;
348 __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
349 a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
350 s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
351 65534 a71 = ((__m128i*)X);
352 65534 s18 = *(a71);
353 65534 a72 = (a71 + 2);
354 65534 s19 = *(a72);
355 65534 a73 = (4 * i9);
356 65534 a74 = (syms + a73);
357 65534 a75 = *(a74);
358 65534 a76 = _mm_set1_epi8(a75);
359 65534 a77 = ((__m128i*)Branchtab);
360 65534 a78 = *(a77);
361 65534 a79 = _mm_xor_si128(a76, a78);
362 65534 b6 = (a73 + syms);
363 65534 a80 = (b6 + 1);
364 65534 a81 = *(a80);
365 65534 a82 = _mm_set1_epi8(a81);
366 65534 a83 = (a77 + 2);
367 65534 a84 = *(a83);
368 65534 a85 = _mm_xor_si128(a82, a84);
369 65534 t13 = _mm_avg_epu8(a79, a85);
370 65534 a86 = ((__m128i)t13);
371 65534 a87 = _mm_srli_epi16(a86, 2);
372 65534 a88 = ((__m128i)a87);
373 131068 t14 = _mm_and_si128(
374 a88,
375 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
376 131068 t15 = _mm_subs_epu8(
377 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
378 t14);
379 65534 m23 = _mm_adds_epu8(s18, t14);
380 65534 m24 = _mm_adds_epu8(s19, t15);
381 65534 m25 = _mm_adds_epu8(s18, t15);
382 65534 m26 = _mm_adds_epu8(s19, t14);
383 65534 a89 = _mm_min_epu8(m24, m23);
384 65534 d9 = _mm_cmpeq_epi8(a89, m24);
385 65534 a90 = _mm_min_epu8(m26, m25);
386 65534 d10 = _mm_cmpeq_epi8(a90, m26);
387 65534 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
388 65534 a91 = ((short int*)dec);
389 65534 a92 = (8 * i9);
390 65534 a93 = (a91 + a92);
391 65534 *(a93) = s20;
392 65534 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
393 65534 a94 = (a93 + 1);
394 65534 *(a94) = s21;
395 65534 s22 = _mm_unpacklo_epi8(a89, a90);
396 65534 s23 = _mm_unpackhi_epi8(a89, a90);
397 65534 a95 = ((__m128i*)Y);
398 65534 *(a95) = s22;
399 65534 a96 = (a95 + 1);
400 65534 *(a96) = s23;
401 65534 a97 = (a71 + 1);
402 65534 s24 = *(a97);
403 65534 a98 = (a71 + 3);
404 65534 s25 = *(a98);
405 65534 a99 = (a77 + 1);
406 65534 a100 = *(a99);
407 65534 a101 = _mm_xor_si128(a76, a100);
408 65534 a102 = (a77 + 3);
409 65534 a103 = *(a102);
410 65534 a104 = _mm_xor_si128(a82, a103);
411 65534 t16 = _mm_avg_epu8(a101, a104);
412 65534 a105 = ((__m128i)t16);
413 65534 a106 = _mm_srli_epi16(a105, 2);
414 65534 a107 = ((__m128i)a106);
415 131068 t17 = _mm_and_si128(
416 a107,
417 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
418 131068 t18 = _mm_subs_epu8(
419 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
420 t17);
421 65534 m27 = _mm_adds_epu8(s24, t17);
422 65534 m28 = _mm_adds_epu8(s25, t18);
423 65534 m29 = _mm_adds_epu8(s24, t18);
424 65534 m30 = _mm_adds_epu8(s25, t17);
425 65534 a108 = _mm_min_epu8(m28, m27);
426 65534 d11 = _mm_cmpeq_epi8(a108, m28);
427 65534 a109 = _mm_min_epu8(m30, m29);
428 65534 d12 = _mm_cmpeq_epi8(a109, m30);
429 65534 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
430 65534 a110 = (a93 + 2);
431 65534 *(a110) = s26;
432 65534 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
433 65534 a111 = (a93 + 3);
434 65534 *(a111) = s27;
435 65534 s28 = _mm_unpacklo_epi8(a108, a109);
436 65534 s29 = _mm_unpackhi_epi8(a108, a109);
437 65534 a112 = (a95 + 2);
438 65534 *(a112) = s28;
439 65534 a113 = (a95 + 3);
440 65534 *(a113) = s29;
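// Conditional renormalization: if the state-0 metric exceeds 210, fold the 64
// metrics down with _mm_min_epu8 to find the global minimum, splat it across
// a register, and subtract it from every state (cf. renormalize() above).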
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 65534 times.
441 65534 if ((((unsigned char*)Y)[0] > 210)) {
442 __m128i m5, m6;
443 m5 = ((__m128i*)Y)[0];
444 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
445 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
446 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
447 __m128i m7;
448 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
449 m7 =
450 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
451 m7 =
452 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
453 m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
454 m7 = _mm_unpacklo_epi8(m7, m7);
455 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
456 m6 = _mm_unpacklo_epi64(m7, m7);
457 ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
458 ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
459 ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
460 ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
461 }
462 unsigned char a188, a194;
463 int a186, a205;
464 short int s48, s49, s54, s55;
465 unsigned char *a187, *a193, *b15;
466 short int *a204, *a206, *a207, *a223, *a224, *b16;
467 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
468 *a225, *a226;
469 __m128i a199, a200, a218, a219;
470 __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
471 a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
472 m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
473 65534 a184 = ((__m128i*)Y);
474 65534 s46 = *(a184);
475 65534 a185 = (a184 + 2);
476 65534 s47 = *(a185);
477 65534 a186 = (4 * i9);
478 65534 b15 = (a186 + syms);
479 65534 a187 = (b15 + 2);
480 65534 a188 = *(a187);
481 65534 a189 = _mm_set1_epi8(a188);
482 65534 a190 = ((__m128i*)Branchtab);
483 65534 a191 = *(a190);
484 65534 a192 = _mm_xor_si128(a189, a191);
485 65534 a193 = (b15 + 3);
486 65534 a194 = *(a193);
487 65534 a195 = _mm_set1_epi8(a194);
488 65534 a196 = (a190 + 2);
489 65534 a197 = *(a196);
490 65534 a198 = _mm_xor_si128(a195, a197);
491 65534 t25 = _mm_avg_epu8(a192, a198);
492 65534 a199 = ((__m128i)t25);
493 65534 a200 = _mm_srli_epi16(a199, 2);
494 65534 a201 = ((__m128i)a200);
495 131068 t26 = _mm_and_si128(
496 a201,
497 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
498 131068 t27 = _mm_subs_epu8(
499 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
500 t26);
501 65534 m39 = _mm_adds_epu8(s46, t26);
502 65534 m40 = _mm_adds_epu8(s47, t27);
503 65534 m41 = _mm_adds_epu8(s46, t27);
504 65534 m42 = _mm_adds_epu8(s47, t26);
505 65534 a202 = _mm_min_epu8(m40, m39);
506 65534 d17 = _mm_cmpeq_epi8(a202, m40);
507 65534 a203 = _mm_min_epu8(m42, m41);
508 65534 d18 = _mm_cmpeq_epi8(a203, m42);
509 65534 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
510 65534 a204 = ((short int*)dec);
511 65534 a205 = (8 * i9);
512 65534 b16 = (a204 + a205);
513 65534 a206 = (b16 + 4);
514 65534 *(a206) = s48;
515 65534 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
516 65534 a207 = (b16 + 5);
517 65534 *(a207) = s49;
518 65534 s50 = _mm_unpacklo_epi8(a202, a203);
519 65534 s51 = _mm_unpackhi_epi8(a202, a203);
520 65534 a208 = ((__m128i*)X);
521 65534 *(a208) = s50;
522 65534 a209 = (a208 + 1);
523 65534 *(a209) = s51;
524 65534 a210 = (a184 + 1);
525 65534 s52 = *(a210);
526 65534 a211 = (a184 + 3);
527 65534 s53 = *(a211);
528 65534 a212 = (a190 + 1);
529 65534 a213 = *(a212);
530 65534 a214 = _mm_xor_si128(a189, a213);
531 65534 a215 = (a190 + 3);
532 65534 a216 = *(a215);
533 65534 a217 = _mm_xor_si128(a195, a216);
534 65534 t28 = _mm_avg_epu8(a214, a217);
535 65534 a218 = ((__m128i)t28);
536 65534 a219 = _mm_srli_epi16(a218, 2);
537 65534 a220 = ((__m128i)a219);
538 131068 t29 = _mm_and_si128(
539 a220,
540 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
541 131068 t30 = _mm_subs_epu8(
542 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
543 t29);
544 65534 m43 = _mm_adds_epu8(s52, t29);
545 65534 m44 = _mm_adds_epu8(s53, t30);
546 65534 m45 = _mm_adds_epu8(s52, t30);
547 65534 m46 = _mm_adds_epu8(s53, t29);
548 65534 a221 = _mm_min_epu8(m44, m43);
549 65534 d19 = _mm_cmpeq_epi8(a221, m44);
550 65534 a222 = _mm_min_epu8(m46, m45);
551 65534 d20 = _mm_cmpeq_epi8(a222, m46);
552 65534 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
553 65534 a223 = (b16 + 6);
554 65534 *(a223) = s54;
555 65534 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
556 65534 a224 = (b16 + 7);
557 65534 *(a224) = s55;
558 65534 s56 = _mm_unpacklo_epi8(a221, a222);
559 65534 s57 = _mm_unpackhi_epi8(a221, a222);
560 65534 a225 = (a208 + 2);
561 65534 *(a225) = s56;
562 65534 a226 = (a208 + 3);
563 65534 *(a226) = s57;
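// Same folding renormalization as above, now applied to the X metrics.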
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 65534 times.
564 65534 if ((((unsigned char*)X)[0] > 210)) {
565 __m128i m12, m13;
566 m12 = ((__m128i*)X)[0];
567 m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
568 m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
569 m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
570 __m128i m14;
571 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
572 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
573 ((__m128i)m14)));
574 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
575 ((__m128i)m14)));
576 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
577 ((__m128i)m14)));
578 m14 = _mm_unpacklo_epi8(m14, m14);
579 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
580 m13 = _mm_unpacklo_epi64(m14, m14);
581 ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
582 ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
583 ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
584 ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
585 }
586 }
587
588 2 renormalize(X, 210);
589
590 /*int ch;
591 for(ch = 0; ch < 64; ch++) {
592 printf("%d,", X[ch]);
593 }
594 printf("\n");*/
595
596 unsigned int j;
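// If framebits + excess is odd, one trailing bit remains after the vector
// loop; it is finished with scalar BFLY butterflies and a final renormalize.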
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
597 4 for (j = 0; j < (framebits + excess) % 2; ++j) {
598 int i;
2/2
✓ Branch 0 taken 64 times.
✓ Branch 1 taken 2 times.
599 66 for (i = 0; i < 64 / 2; i++) {
600 64 BFLY(i,
601 64 (((framebits + excess) >> 1) << 1) + j,
602 syms,
603 Y,
604 X,
605 (decision_t*)dec,
606 Branchtab);
607 }
608
609
610 2 renormalize(Y, 210);
611
612 /*printf("\n");
613 for(ch = 0; ch < 64; ch++) {
614 printf("%d,", Y[ch]);
615 }
616 printf("\n");*/
617 }
618 /*skip*/
619 2 }
620
621 #endif /*LV_HAVE_SSE3*/
622
623 #if LV_HAVE_NEON
624
625 #include "volk/sse2neon.h"
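// sse2neon translates the SSE intrinsics below to NEON, so this kernel is the
// same spiral-generated code path as the SSE3 version above.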
626
627 static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
628 unsigned char* X,
629 unsigned char* syms,
630 unsigned char* dec,
631 unsigned int framebits,
632 unsigned int excess,
633 unsigned char* Branchtab)
634 {
635 unsigned int i9;
636 for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
637 unsigned char a75, a81;
638 int a73, a92;
639 short int s20, s21, s26, s27;
640 unsigned char *a74, *a80, *b6;
641 short int *a110, *a111, *a91, *a93, *a94;
642 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
643 __m128i a105, a106, a86, a87;
644 __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
645 a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
646 s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
647 a71 = ((__m128i*)X);
648 s18 = *(a71);
649 a72 = (a71 + 2);
650 s19 = *(a72);
651 a73 = (4 * i9);
652 a74 = (syms + a73);
653 a75 = *(a74);
654 a76 = _mm_set1_epi8(a75);
655 a77 = ((__m128i*)Branchtab);
656 a78 = *(a77);
657 a79 = _mm_xor_si128(a76, a78);
658 b6 = (a73 + syms);
659 a80 = (b6 + 1);
660 a81 = *(a80);
661 a82 = _mm_set1_epi8(a81);
662 a83 = (a77 + 2);
663 a84 = *(a83);
664 a85 = _mm_xor_si128(a82, a84);
665 t13 = _mm_avg_epu8(a79, a85);
666 a86 = ((__m128i)t13);
667 a87 = _mm_srli_epi16(a86, 2);
668 a88 = ((__m128i)a87);
669 t14 = _mm_and_si128(
670 a88,
671 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
672 t15 = _mm_subs_epu8(
673 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
674 t14);
675 m23 = _mm_adds_epu8(s18, t14);
676 m24 = _mm_adds_epu8(s19, t15);
677 m25 = _mm_adds_epu8(s18, t15);
678 m26 = _mm_adds_epu8(s19, t14);
679 a89 = _mm_min_epu8(m24, m23);
680 d9 = _mm_cmpeq_epi8(a89, m24);
681 a90 = _mm_min_epu8(m26, m25);
682 d10 = _mm_cmpeq_epi8(a90, m26);
683 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
684 a91 = ((short int*)dec);
685 a92 = (8 * i9);
686 a93 = (a91 + a92);
687 *(a93) = s20;
688 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
689 a94 = (a93 + 1);
690 *(a94) = s21;
691 s22 = _mm_unpacklo_epi8(a89, a90);
692 s23 = _mm_unpackhi_epi8(a89, a90);
693 a95 = ((__m128i*)Y);
694 *(a95) = s22;
695 a96 = (a95 + 1);
696 *(a96) = s23;
697 a97 = (a71 + 1);
698 s24 = *(a97);
699 a98 = (a71 + 3);
700 s25 = *(a98);
701 a99 = (a77 + 1);
702 a100 = *(a99);
703 a101 = _mm_xor_si128(a76, a100);
704 a102 = (a77 + 3);
705 a103 = *(a102);
706 a104 = _mm_xor_si128(a82, a103);
707 t16 = _mm_avg_epu8(a101, a104);
708 a105 = ((__m128i)t16);
709 a106 = _mm_srli_epi16(a105, 2);
710 a107 = ((__m128i)a106);
711 t17 = _mm_and_si128(
712 a107,
713 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
714 t18 = _mm_subs_epu8(
715 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
716 t17);
717 m27 = _mm_adds_epu8(s24, t17);
718 m28 = _mm_adds_epu8(s25, t18);
719 m29 = _mm_adds_epu8(s24, t18);
720 m30 = _mm_adds_epu8(s25, t17);
721 a108 = _mm_min_epu8(m28, m27);
722 d11 = _mm_cmpeq_epi8(a108, m28);
723 a109 = _mm_min_epu8(m30, m29);
724 d12 = _mm_cmpeq_epi8(a109, m30);
725 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
726 a110 = (a93 + 2);
727 *(a110) = s26;
728 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
729 a111 = (a93 + 3);
730 *(a111) = s27;
731 s28 = _mm_unpacklo_epi8(a108, a109);
732 s29 = _mm_unpackhi_epi8(a108, a109);
733 a112 = (a95 + 2);
734 *(a112) = s28;
735 a113 = (a95 + 3);
736 *(a113) = s29;
737 if ((((unsigned char*)Y)[0] > 210)) {
738 __m128i m5, m6;
739 m5 = ((__m128i*)Y)[0];
740 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
741 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
742 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
743 __m128i m7;
744 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
745 m7 =
746 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
747 m7 =
748 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
749 m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
750 m7 = _mm_unpacklo_epi8(m7, m7);
751 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
752 m6 = _mm_unpacklo_epi64(m7, m7);
753 ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
754 ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
755 ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
756 ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
757 }
758 unsigned char a188, a194;
759 int a186, a205;
760 short int s48, s49, s54, s55;
761 unsigned char *a187, *a193, *b15;
762 short int *a204, *a206, *a207, *a223, *a224, *b16;
763 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
764 *a225, *a226;
765 __m128i a199, a200, a218, a219;
766 __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
767 a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
768 m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
769 a184 = ((__m128i*)Y);
770 s46 = *(a184);
771 a185 = (a184 + 2);
772 s47 = *(a185);
773 a186 = (4 * i9);
774 b15 = (a186 + syms);
775 a187 = (b15 + 2);
776 a188 = *(a187);
777 a189 = _mm_set1_epi8(a188);
778 a190 = ((__m128i*)Branchtab);
779 a191 = *(a190);
780 a192 = _mm_xor_si128(a189, a191);
781 a193 = (b15 + 3);
782 a194 = *(a193);
783 a195 = _mm_set1_epi8(a194);
784 a196 = (a190 + 2);
785 a197 = *(a196);
786 a198 = _mm_xor_si128(a195, a197);
787 t25 = _mm_avg_epu8(a192, a198);
788 a199 = ((__m128i)t25);
789 a200 = _mm_srli_epi16(a199, 2);
790 a201 = ((__m128i)a200);
791 t26 = _mm_and_si128(
792 a201,
793 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
794 t27 = _mm_subs_epu8(
795 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
796 t26);
797 m39 = _mm_adds_epu8(s46, t26);
798 m40 = _mm_adds_epu8(s47, t27);
799 m41 = _mm_adds_epu8(s46, t27);
800 m42 = _mm_adds_epu8(s47, t26);
801 a202 = _mm_min_epu8(m40, m39);
802 d17 = _mm_cmpeq_epi8(a202, m40);
803 a203 = _mm_min_epu8(m42, m41);
804 d18 = _mm_cmpeq_epi8(a203, m42);
805 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
806 a204 = ((short int*)dec);
807 a205 = (8 * i9);
808 b16 = (a204 + a205);
809 a206 = (b16 + 4);
810 *(a206) = s48;
811 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
812 a207 = (b16 + 5);
813 *(a207) = s49;
814 s50 = _mm_unpacklo_epi8(a202, a203);
815 s51 = _mm_unpackhi_epi8(a202, a203);
816 a208 = ((__m128i*)X);
817 *(a208) = s50;
818 a209 = (a208 + 1);
819 *(a209) = s51;
820 a210 = (a184 + 1);
821 s52 = *(a210);
822 a211 = (a184 + 3);
823 s53 = *(a211);
824 a212 = (a190 + 1);
825 a213 = *(a212);
826 a214 = _mm_xor_si128(a189, a213);
827 a215 = (a190 + 3);
828 a216 = *(a215);
829 a217 = _mm_xor_si128(a195, a216);
830 t28 = _mm_avg_epu8(a214, a217);
831 a218 = ((__m128i)t28);
832 a219 = _mm_srli_epi16(a218, 2);
833 a220 = ((__m128i)a219);
834 t29 = _mm_and_si128(
835 a220,
836 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
837 t30 = _mm_subs_epu8(
838 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
839 t29);
840 m43 = _mm_adds_epu8(s52, t29);
841 m44 = _mm_adds_epu8(s53, t30);
842 m45 = _mm_adds_epu8(s52, t30);
843 m46 = _mm_adds_epu8(s53, t29);
844 a221 = _mm_min_epu8(m44, m43);
845 d19 = _mm_cmpeq_epi8(a221, m44);
846 a222 = _mm_min_epu8(m46, m45);
847 d20 = _mm_cmpeq_epi8(a222, m46);
848 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
849 a223 = (b16 + 6);
850 *(a223) = s54;
851 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
852 a224 = (b16 + 7);
853 *(a224) = s55;
854 s56 = _mm_unpacklo_epi8(a221, a222);
855 s57 = _mm_unpackhi_epi8(a221, a222);
856 a225 = (a208 + 2);
857 *(a225) = s56;
858 a226 = (a208 + 3);
859 *(a226) = s57;
860 if ((((unsigned char*)X)[0] > 210)) {
861 __m128i m12, m13;
862 m12 = ((__m128i*)X)[0];
863 m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
864 m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
865 m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
866 __m128i m14;
867 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
868 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
869 ((__m128i)m14)));
870 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
871 ((__m128i)m14)));
872 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
873 ((__m128i)m14)));
874 m14 = _mm_unpacklo_epi8(m14, m14);
875 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
876 m13 = _mm_unpacklo_epi64(m14, m14);
877 ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
878 ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
879 ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
880 ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
881 }
882 }
883
884 renormalize(X, 210);
885
886 /*int ch;
887 for(ch = 0; ch < 64; ch++) {
888 printf("%d,", X[ch]);
889 }
890 printf("\n");*/
891
892 unsigned int j;
893 for (j = 0; j < (framebits + excess) % 2; ++j) {
894 int i;
895 for (i = 0; i < 64 / 2; i++) {
896 BFLY(i,
897 (((framebits + excess) >> 1) << 1) + j,
898 syms,
899 Y,
900 X,
901 (decision_t*)dec,
902 Branchtab);
903 }
904
905
906 renormalize(Y, 210);
907
908 /*printf("\n");
909 for(ch = 0; ch < 64; ch++) {
910 printf("%d,", Y[ch]);
911 }
912 printf("\n");*/
913 }
914 /*skip*/
915 }
916
917 #endif /*LV_HAVE_NEON*/
918
919 #if LV_HAVE_GENERIC
920
921 2 static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
922 unsigned char* X,
923 unsigned char* syms,
924 unsigned char* dec,
925 unsigned int framebits,
926 unsigned int excess,
927 unsigned char* Branchtab)
928 {
929 2 int nbits = framebits + excess;
930 2 int NUMSTATES = 64;
931 2 int RENORMALIZE_THRESHOLD = 210;
932
933 int s, i;
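// Generic scalar path: one trellis step per input bit, 32 butterflies per
// step, then renormalize and swap the X/Y metric buffers for the next bit.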
2/2
✓ Branch 0 taken 131070 times.
✓ Branch 1 taken 2 times.
934 131072 for (s = 0; s < nbits; s++) {
935 void* tmp;
2/2
✓ Branch 0 taken 4194240 times.
✓ Branch 1 taken 131070 times.
936 4325310 for (i = 0; i < NUMSTATES / 2; i++) {
937 4194240 BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
938 }
939
940 131070 renormalize(Y, RENORMALIZE_THRESHOLD);
941
942 /// Swap pointers to old and new metrics
943 131070 tmp = (void*)X;
944 131070 X = Y;
945 131070 Y = (unsigned char*)tmp;
946 }
947 2 }
948
949 #endif /* LV_HAVE_GENERIC */
950
951 #endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
952