| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_8u_x4_conv_k7_r2_8u | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * Performs convolutional decoding for a K=7, rate 1/2 convolutional | ||
| 16 | * code. The generator polynomials are user defined. | ||
| 17 | * | ||
| 18 | * <b>Dispatcher Prototype</b> | ||
| 19 | * \code | ||
| 20 | * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, | ||
| 21 | * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* | ||
| 22 | * Branchtab) \endcode | ||
| 23 | * | ||
| 24 | * \b Inputs | ||
| 25 | * \li X: array of the 64 path metrics at the current trellis step (updated in place). | ||
| 26 | * \li syms: input symbols, 2 * (framebits + excess) bytes, one byte per received symbol. | ||
| 27 | * \li dec: buffer receiving the packed decision bits, one 8-byte decision_t per step. | ||
| 28 | * \li framebits: size of the frame to decode in bits. | ||
| 29 | * \li excess: number of additional bits processed beyond framebits (e.g. flush/tail bits). | ||
| 30 | * \li Branchtab: pre-computed branch metric table, 2 * 32 bytes, derived from the code polynomials. | ||
| 31 | * | ||
| 32 | * \b Outputs | ||
| 33 | * \li Y: working buffer for the updated path metrics; the packed decision bits are written to dec. | ||
| 34 | * | ||
| 35 | * \b Example | ||
| 36 | * \code | ||
| 37 | * // allocate X, Y, syms, dec and Branchtab with volk_malloc() and fill them | ||
| 38 | * // (a hedged buffer-setup sketch follows this listing) | ||
| 39 | * volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab); | ||
| 40 | * | ||
| 41 | * // release the buffers with volk_free() when done | ||
| 42 | * \endcode | ||
| 43 | */ | ||
| 44 | |||
| 45 | #ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H | ||
| 46 | #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H | ||
| 47 | |||
| 48 | typedef union { | ||
| 49 | unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/]; | ||
| 50 | unsigned int w[64 /*NUMSTATES*/ / 32]; | ||
| 51 | unsigned short s[64 /*NUMSTATES*/ / 16]; | ||
| 52 | unsigned char c[64 /*NUMSTATES*/ / 8]; | ||
| 53 | #ifdef _MSC_VER | ||
| 54 | } decision_t; | ||
| 55 | #else | ||
| 56 | } decision_t __attribute__((aligned(16))); | ||
| 57 | #endif | ||
| 58 | |||
| 59 | |||
| 60 | 131074 | static inline void renormalize(unsigned char* X, unsigned char threshold) | |
| 61 | { | ||
| 62 | 131074 | int NUMSTATES = 64; | |
| 63 | int i; | ||
| 64 | |||
| 65 | 131074 | unsigned char min = X[0]; | |
| 66 | // if(min > threshold) { | ||
| 67 | 2/2 ✓ Branch 0 taken 8388736 times. ✓ Branch 1 taken 131074 times. | 8519810 | for (i = 0; i < NUMSTATES; i++) |
| 68 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 8388736 times. | 8388736 | if (min > X[i]) |
| 69 | ✗ | min = X[i]; | |
| 70 | 2/2 ✓ Branch 0 taken 8388736 times. ✓ Branch 1 taken 131074 times. | 8519810 | for (i = 0; i < NUMSTATES; i++) |
| 71 | 8388736 | X[i] -= min; | |
| 72 | //} | ||
| 73 | 131074 | } | |
| 74 | |||
| 75 | |||
| 76 | // helper BFLY for GENERIC version (the decision-bit packing is illustrated in a sketch after this listing) | ||
| 77 | 4194304 | static inline void BFLY(int i, | |
| 78 | int s, | ||
| 79 | unsigned char* syms, | ||
| 80 | unsigned char* Y, | ||
| 81 | unsigned char* X, | ||
| 82 | decision_t* d, | ||
| 83 | unsigned char* Branchtab) | ||
| 84 | { | ||
| 85 | int j; | ||
| 86 | unsigned int decision0, decision1; | ||
| 87 | unsigned char metric, m0, m1, m2, m3; | ||
| 88 | |||
| 89 | 4194304 | int NUMSTATES = 64; | |
| 90 | 4194304 | int RATE = 2; | |
| 91 | 4194304 | int METRICSHIFT = 2; | |
| 92 | 4194304 | int PRECISIONSHIFT = 2; | |
| 93 | |||
| 94 | 4194304 | metric = 0; | |
| 95 | 2/2 ✓ Branch 0 taken 8388608 times. ✓ Branch 1 taken 4194304 times. | 12582912 | for (j = 0; j < RATE; j++) |
| 96 | 8388608 | metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; | |
| 97 | 4194304 | metric = metric >> PRECISIONSHIFT; | |
| 98 | |||
| 99 | 4194304 | unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); | |
| 100 | |||
| 101 | 4194304 | m0 = X[i] + metric; | |
| 102 | 4194304 | m1 = X[i + NUMSTATES / 2] + (max - metric); | |
| 103 | 4194304 | m2 = X[i] + (max - metric); | |
| 104 | 4194304 | m3 = X[i + NUMSTATES / 2] + metric; | |
| 105 | |||
| 106 | 4194304 | decision0 = (signed int)(m0 - m1) > 0; | |
| 107 | 4194304 | decision1 = (signed int)(m2 - m3) > 0; | |
| 108 | |||
| 109 | 2/2 ✓ Branch 0 taken 655660 times. ✓ Branch 1 taken 3538644 times. | 4194304 | Y[2 * i] = decision0 ? m1 : m0; |
| 110 | 2/2 ✓ Branch 0 taken 1179926 times. ✓ Branch 1 taken 3014378 times. | 4194304 | Y[2 * i + 1] = decision1 ? m3 : m2; |
| 111 | |||
| 112 | 4194304 | d->w[i / (sizeof(unsigned int) * 8 / 2) + | |
| 113 | 4194304 | s * (sizeof(decision_t) / sizeof(unsigned int))] |= | |
| 114 | 4194304 | (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1)); | |
| 115 | 4194304 | } | |
| 116 | |||
| 117 | |||
| 118 | //#if LV_HAVE_AVX2 | ||
| 119 | // | ||
| 120 | //#include <immintrin.h> | ||
| 121 | //#include <stdio.h> | ||
| 122 | // | ||
| 123 | // static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, | ||
| 124 | // unsigned char* X, | ||
| 125 | // unsigned char* syms, | ||
| 126 | // unsigned char* dec, | ||
| 127 | // unsigned int framebits, | ||
| 128 | // unsigned int excess, | ||
| 129 | // unsigned char* Branchtab) | ||
| 130 | //{ | ||
| 131 | // unsigned int i9; | ||
| 132 | // for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { | ||
| 133 | // unsigned char a75, a81; | ||
| 134 | // int a73, a92; | ||
| 135 | // int s20, s21; | ||
| 136 | // unsigned char *a80, *b6; | ||
| 137 | // int *a110, *a91, *a93; | ||
| 138 | // __m256i *a112, *a71, *a72, *a77, *a83, *a95; | ||
| 139 | // __m256i a86, a87; | ||
| 140 | // __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, | ||
| 141 | // m26, | ||
| 142 | // s18, s19, s22, s23, s24, s25, t13, t14, t15; | ||
| 143 | // a71 = ((__m256i*)X); | ||
| 144 | // s18 = *(a71); | ||
| 145 | // a72 = (a71 + 1); | ||
| 146 | // s19 = *(a72); | ||
| 147 | // s22 = _mm256_permute2x128_si256(s18, s19, 0x20); | ||
| 148 | // s19 = _mm256_permute2x128_si256(s18, s19, 0x31); | ||
| 149 | // s18 = s22; | ||
| 150 | // a73 = (4 * i9); | ||
| 151 | // b6 = (syms + a73); | ||
| 152 | // a75 = *(b6); | ||
| 153 | // a76 = _mm256_set1_epi8(a75); | ||
| 154 | // a77 = ((__m256i*)Branchtab); | ||
| 155 | // a78 = *(a77); | ||
| 156 | // a79 = _mm256_xor_si256(a76, a78); | ||
| 157 | // a80 = (b6 + 1); | ||
| 158 | // a81 = *(a80); | ||
| 159 | // a82 = _mm256_set1_epi8(a81); | ||
| 160 | // a83 = (a77 + 1); | ||
| 161 | // a84 = *(a83); | ||
| 162 | // a85 = _mm256_xor_si256(a82, a84); | ||
| 163 | // t13 = _mm256_avg_epu8(a79, a85); | ||
| 164 | // a86 = ((__m256i)t13); | ||
| 165 | // a87 = _mm256_srli_epi16(a86, 2); | ||
| 166 | // a88 = ((__m256i)a87); | ||
| 167 | // t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); | ||
| 168 | // t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); | ||
| 169 | // m23 = _mm256_adds_epu8(s18, t14); | ||
| 170 | // m24 = _mm256_adds_epu8(s19, t15); | ||
| 171 | // m25 = _mm256_adds_epu8(s18, t15); | ||
| 172 | // m26 = _mm256_adds_epu8(s19, t14); | ||
| 173 | // a89 = _mm256_min_epu8(m24, m23); | ||
| 174 | // d9 = _mm256_cmpeq_epi8(a89, m24); | ||
| 175 | // a90 = _mm256_min_epu8(m26, m25); | ||
| 176 | // d10 = _mm256_cmpeq_epi8(a90, m26); | ||
| 177 | // s22 = _mm256_unpacklo_epi8(d9, d10); | ||
| 178 | // s23 = _mm256_unpackhi_epi8(d9, d10); | ||
| 179 | // s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); | ||
| 180 | // a91 = ((int*)dec); | ||
| 181 | // a92 = (4 * i9); | ||
| 182 | // a93 = (a91 + a92); | ||
| 183 | // *(a93) = s20; | ||
| 184 | // s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); | ||
| 185 | // a110 = (a93 + 1); | ||
| 186 | // *(a110) = s21; | ||
| 187 | // s22 = _mm256_unpacklo_epi8(a89, a90); | ||
| 188 | // s23 = _mm256_unpackhi_epi8(a89, a90); | ||
| 189 | // a95 = ((__m256i*)Y); | ||
| 190 | // s24 = _mm256_permute2x128_si256(s22, s23, 0x20); | ||
| 191 | // *(a95) = s24; | ||
| 192 | // s23 = _mm256_permute2x128_si256(s22, s23, 0x31); | ||
| 193 | // a112 = (a95 + 1); | ||
| 194 | // *(a112) = s23; | ||
| 195 | // if ((((unsigned char*)Y)[0] > 210)) { | ||
| 196 | // __m256i m5, m6; | ||
| 197 | // m5 = ((__m256i*)Y)[0]; | ||
| 198 | // m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]); | ||
| 199 | // __m256i m7; | ||
| 200 | // m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); | ||
| 201 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), | ||
| 202 | // ((__m256i)m7))); | ||
| 203 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), | ||
| 204 | // ((__m256i)m7))); | ||
| 205 | // m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), | ||
| 206 | // ((__m256i)m7))); | ||
| 207 | // m7 = _mm256_unpacklo_epi8(m7, m7); | ||
| 208 | // m7 = _mm256_shufflelo_epi16(m7, 0); | ||
| 209 | // m6 = _mm256_unpacklo_epi64(m7, m7); | ||
| 210 | // m6 = _mm256_permute2x128_si256( | ||
| 211 | // m6, m6, 0); // copy lower half of m6 to upper half, since above ops | ||
| 212 | // // operate on 128 bit lanes | ||
| 213 | // ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6); | ||
| 214 | // ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6); | ||
| 215 | // } | ||
| 216 | // unsigned char a188, a194; | ||
| 217 | // int a205; | ||
| 218 | // int s48, s54; | ||
| 219 | // unsigned char *a187, *a193; | ||
| 220 | // int *a204, *a206, *a223, *b16; | ||
| 221 | // __m256i *a184, *a185, *a190, *a196, *a208, *a225; | ||
| 222 | // __m256i a199, a200; | ||
| 223 | // __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, | ||
| 224 | // m40, | ||
| 225 | // m41, m42, s46, s47, s50, s51, t25, t26, t27; | ||
| 226 | // a184 = ((__m256i*)Y); | ||
| 227 | // s46 = *(a184); | ||
| 228 | // a185 = (a184 + 1); | ||
| 229 | // s47 = *(a185); | ||
| 230 | // s50 = _mm256_permute2x128_si256(s46, s47, 0x20); | ||
| 231 | // s47 = _mm256_permute2x128_si256(s46, s47, 0x31); | ||
| 232 | // s46 = s50; | ||
| 233 | // a187 = (b6 + 2); | ||
| 234 | // a188 = *(a187); | ||
| 235 | // a189 = _mm256_set1_epi8(a188); | ||
| 236 | // a190 = ((__m256i*)Branchtab); | ||
| 237 | // a191 = *(a190); | ||
| 238 | // a192 = _mm256_xor_si256(a189, a191); | ||
| 239 | // a193 = (b6 + 3); | ||
| 240 | // a194 = *(a193); | ||
| 241 | // a195 = _mm256_set1_epi8(a194); | ||
| 242 | // a196 = (a190 + 1); | ||
| 243 | // a197 = *(a196); | ||
| 244 | // a198 = _mm256_xor_si256(a195, a197); | ||
| 245 | // t25 = _mm256_avg_epu8(a192, a198); | ||
| 246 | // a199 = ((__m256i)t25); | ||
| 247 | // a200 = _mm256_srli_epi16(a199, 2); | ||
| 248 | // a201 = ((__m256i)a200); | ||
| 249 | // t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); | ||
| 250 | // t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); | ||
| 251 | // m39 = _mm256_adds_epu8(s46, t26); | ||
| 252 | // m40 = _mm256_adds_epu8(s47, t27); | ||
| 253 | // m41 = _mm256_adds_epu8(s46, t27); | ||
| 254 | // m42 = _mm256_adds_epu8(s47, t26); | ||
| 255 | // a202 = _mm256_min_epu8(m40, m39); | ||
| 256 | // d17 = _mm256_cmpeq_epi8(a202, m40); | ||
| 257 | // a203 = _mm256_min_epu8(m42, m41); | ||
| 258 | // d18 = _mm256_cmpeq_epi8(a203, m42); | ||
| 259 | // s24 = _mm256_unpacklo_epi8(d17, d18); | ||
| 260 | // s25 = _mm256_unpackhi_epi8(d17, d18); | ||
| 261 | // s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); | ||
| 262 | // a204 = ((int*)dec); | ||
| 263 | // a205 = (4 * i9); | ||
| 264 | // b16 = (a204 + a205); | ||
| 265 | // a206 = (b16 + 2); | ||
| 266 | // *(a206) = s48; | ||
| 267 | // s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); | ||
| 268 | // a223 = (b16 + 3); | ||
| 269 | // *(a223) = s54; | ||
| 270 | // s50 = _mm256_unpacklo_epi8(a202, a203); | ||
| 271 | // s51 = _mm256_unpackhi_epi8(a202, a203); | ||
| 272 | // s25 = _mm256_permute2x128_si256(s50, s51, 0x20); | ||
| 273 | // s51 = _mm256_permute2x128_si256(s50, s51, 0x31); | ||
| 274 | // a208 = ((__m256i*)X); | ||
| 275 | // *(a208) = s25; | ||
| 276 | // a225 = (a208 + 1); | ||
| 277 | // *(a225) = s51; | ||
| 278 | // | ||
| 279 | // if ((((unsigned char*)X)[0] > 210)) { | ||
| 280 | // __m256i m12, m13; | ||
| 281 | // m12 = ((__m256i*)X)[0]; | ||
| 282 | // m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]); | ||
| 283 | // __m256i m14; | ||
| 284 | // m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); | ||
| 285 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), | ||
| 286 | // ((__m256i)m14))); | ||
| 287 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), | ||
| 288 | // ((__m256i)m14))); | ||
| 289 | // m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), | ||
| 290 | // ((__m256i)m14))); | ||
| 291 | // m14 = _mm256_unpacklo_epi8(m14, m14); | ||
| 292 | // m14 = _mm256_shufflelo_epi16(m14, 0); | ||
| 293 | // m13 = _mm256_unpacklo_epi64(m14, m14); | ||
| 294 | // m13 = _mm256_permute2x128_si256(m13, m13, 0); | ||
| 295 | // ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13); | ||
| 296 | // ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13); | ||
| 297 | // } | ||
| 298 | // } | ||
| 299 | // | ||
| 300 | // renormalize(X, 210); | ||
| 301 | // | ||
| 302 | // unsigned int j; | ||
| 303 | // for (j = 0; j < (framebits + excess) % 2; ++j) { | ||
| 304 | // int i; | ||
| 305 | // for (i = 0; i < 64 / 2; i++) { | ||
| 306 | // BFLY(i, | ||
| 307 | // (((framebits + excess) >> 1) << 1) + j, | ||
| 308 | // syms, | ||
| 309 | // Y, | ||
| 310 | // X, | ||
| 311 | // (decision_t*)dec, | ||
| 312 | // Branchtab); | ||
| 313 | // } | ||
| 314 | // | ||
| 315 | // renormalize(Y, 210); | ||
| 316 | // } | ||
| 317 | // /*skip*/ | ||
| 318 | //} | ||
| 319 | // | ||
| 320 | //#endif /*LV_HAVE_AVX2*/ | ||
| 321 | |||
| 322 | |||
| 323 | #if LV_HAVE_SSE3 | ||
| 324 | |||
| 325 | #include <emmintrin.h> | ||
| 326 | #include <mmintrin.h> | ||
| 327 | #include <pmmintrin.h> | ||
| 328 | #include <stdio.h> | ||
| 329 | #include <xmmintrin.h> | ||
| 330 | |||
| 331 | 2 | static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, | |
| 332 | unsigned char* X, | ||
| 333 | unsigned char* syms, | ||
| 334 | unsigned char* dec, | ||
| 335 | unsigned int framebits, | ||
| 336 | unsigned int excess, | ||
| 337 | unsigned char* Branchtab) | ||
| 338 | { | ||
| 339 | unsigned int i9; | ||
| 340 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { |
| 341 | unsigned char a75, a81; | ||
| 342 | int a73, a92; | ||
| 343 | short int s20, s21, s26, s27; | ||
| 344 | unsigned char *a74, *a80, *b6; | ||
| 345 | short int *a110, *a111, *a91, *a93, *a94; | ||
| 346 | __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; | ||
| 347 | __m128i a105, a106, a86, a87; | ||
| 348 | __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, | ||
| 349 | a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, | ||
| 350 | s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; | ||
| 351 | 65534 | a71 = ((__m128i*)X); | |
| 352 | 65534 | s18 = *(a71); | |
| 353 | 65534 | a72 = (a71 + 2); | |
| 354 | 65534 | s19 = *(a72); | |
| 355 | 65534 | a73 = (4 * i9); | |
| 356 | 65534 | a74 = (syms + a73); | |
| 357 | 65534 | a75 = *(a74); | |
| 358 | 65534 | a76 = _mm_set1_epi8(a75); | |
| 359 | 65534 | a77 = ((__m128i*)Branchtab); | |
| 360 | 65534 | a78 = *(a77); | |
| 361 | 65534 | a79 = _mm_xor_si128(a76, a78); | |
| 362 | 65534 | b6 = (a73 + syms); | |
| 363 | 65534 | a80 = (b6 + 1); | |
| 364 | 65534 | a81 = *(a80); | |
| 365 | 65534 | a82 = _mm_set1_epi8(a81); | |
| 366 | 65534 | a83 = (a77 + 2); | |
| 367 | 65534 | a84 = *(a83); | |
| 368 | 65534 | a85 = _mm_xor_si128(a82, a84); | |
| 369 | 65534 | t13 = _mm_avg_epu8(a79, a85); | |
| 370 | 65534 | a86 = ((__m128i)t13); | |
| 371 | 65534 | a87 = _mm_srli_epi16(a86, 2); | |
| 372 | 65534 | a88 = ((__m128i)a87); | |
| 373 | 131068 | t14 = _mm_and_si128( | |
| 374 | a88, | ||
| 375 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 376 | 131068 | t15 = _mm_subs_epu8( | |
| 377 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 378 | t14); | ||
| 379 | 65534 | m23 = _mm_adds_epu8(s18, t14); | |
| 380 | 65534 | m24 = _mm_adds_epu8(s19, t15); | |
| 381 | 65534 | m25 = _mm_adds_epu8(s18, t15); | |
| 382 | 65534 | m26 = _mm_adds_epu8(s19, t14); | |
| 383 | 65534 | a89 = _mm_min_epu8(m24, m23); | |
| 384 | 65534 | d9 = _mm_cmpeq_epi8(a89, m24); | |
| 385 | 65534 | a90 = _mm_min_epu8(m26, m25); | |
| 386 | 65534 | d10 = _mm_cmpeq_epi8(a90, m26); | |
| 387 | 65534 | s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); | |
| 388 | 65534 | a91 = ((short int*)dec); | |
| 389 | 65534 | a92 = (8 * i9); | |
| 390 | 65534 | a93 = (a91 + a92); | |
| 391 | 65534 | *(a93) = s20; | |
| 392 | 65534 | s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); | |
| 393 | 65534 | a94 = (a93 + 1); | |
| 394 | 65534 | *(a94) = s21; | |
| 395 | 65534 | s22 = _mm_unpacklo_epi8(a89, a90); | |
| 396 | 65534 | s23 = _mm_unpackhi_epi8(a89, a90); | |
| 397 | 65534 | a95 = ((__m128i*)Y); | |
| 398 | 65534 | *(a95) = s22; | |
| 399 | 65534 | a96 = (a95 + 1); | |
| 400 | 65534 | *(a96) = s23; | |
| 401 | 65534 | a97 = (a71 + 1); | |
| 402 | 65534 | s24 = *(a97); | |
| 403 | 65534 | a98 = (a71 + 3); | |
| 404 | 65534 | s25 = *(a98); | |
| 405 | 65534 | a99 = (a77 + 1); | |
| 406 | 65534 | a100 = *(a99); | |
| 407 | 65534 | a101 = _mm_xor_si128(a76, a100); | |
| 408 | 65534 | a102 = (a77 + 3); | |
| 409 | 65534 | a103 = *(a102); | |
| 410 | 65534 | a104 = _mm_xor_si128(a82, a103); | |
| 411 | 65534 | t16 = _mm_avg_epu8(a101, a104); | |
| 412 | 65534 | a105 = ((__m128i)t16); | |
| 413 | 65534 | a106 = _mm_srli_epi16(a105, 2); | |
| 414 | 65534 | a107 = ((__m128i)a106); | |
| 415 | 131068 | t17 = _mm_and_si128( | |
| 416 | a107, | ||
| 417 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 418 | 131068 | t18 = _mm_subs_epu8( | |
| 419 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 420 | t17); | ||
| 421 | 65534 | m27 = _mm_adds_epu8(s24, t17); | |
| 422 | 65534 | m28 = _mm_adds_epu8(s25, t18); | |
| 423 | 65534 | m29 = _mm_adds_epu8(s24, t18); | |
| 424 | 65534 | m30 = _mm_adds_epu8(s25, t17); | |
| 425 | 65534 | a108 = _mm_min_epu8(m28, m27); | |
| 426 | 65534 | d11 = _mm_cmpeq_epi8(a108, m28); | |
| 427 | 65534 | a109 = _mm_min_epu8(m30, m29); | |
| 428 | 65534 | d12 = _mm_cmpeq_epi8(a109, m30); | |
| 429 | 65534 | s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); | |
| 430 | 65534 | a110 = (a93 + 2); | |
| 431 | 65534 | *(a110) = s26; | |
| 432 | 65534 | s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); | |
| 433 | 65534 | a111 = (a93 + 3); | |
| 434 | 65534 | *(a111) = s27; | |
| 435 | 65534 | s28 = _mm_unpacklo_epi8(a108, a109); | |
| 436 | 65534 | s29 = _mm_unpackhi_epi8(a108, a109); | |
| 437 | 65534 | a112 = (a95 + 2); | |
| 438 | 65534 | *(a112) = s28; | |
| 439 | 65534 | a113 = (a95 + 3); | |
| 440 | 65534 | *(a113) = s29; | |
| 441 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 65534 times. | 65534 | if ((((unsigned char*)Y)[0] > 210)) { |
| 442 | __m128i m5, m6; | ||
| 443 | ✗ | m5 = ((__m128i*)Y)[0]; | |
| 444 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); | |
| 445 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); | |
| 446 | ✗ | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); | |
| 447 | __m128i m7; | ||
| 448 | ✗ | m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); | |
| 449 | m7 = | ||
| 450 | ✗ | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); | |
| 451 | m7 = | ||
| 452 | ✗ | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); | |
| 453 | ✗ | m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); | |
| 454 | ✗ | m7 = _mm_unpacklo_epi8(m7, m7); | |
| 455 | ✗ | m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); | |
| 456 | ✗ | m6 = _mm_unpacklo_epi64(m7, m7); | |
| 457 | ✗ | ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); | |
| 458 | ✗ | ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); | |
| 459 | ✗ | ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); | |
| 460 | ✗ | ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); | |
| 461 | } | ||
| 462 | unsigned char a188, a194; | ||
| 463 | int a186, a205; | ||
| 464 | short int s48, s49, s54, s55; | ||
| 465 | unsigned char *a187, *a193, *b15; | ||
| 466 | short int *a204, *a206, *a207, *a223, *a224, *b16; | ||
| 467 | __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, | ||
| 468 | *a225, *a226; | ||
| 469 | __m128i a199, a200, a218, a219; | ||
| 470 | __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, | ||
| 471 | a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, | ||
| 472 | m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; | ||
| 473 | 65534 | a184 = ((__m128i*)Y); | |
| 474 | 65534 | s46 = *(a184); | |
| 475 | 65534 | a185 = (a184 + 2); | |
| 476 | 65534 | s47 = *(a185); | |
| 477 | 65534 | a186 = (4 * i9); | |
| 478 | 65534 | b15 = (a186 + syms); | |
| 479 | 65534 | a187 = (b15 + 2); | |
| 480 | 65534 | a188 = *(a187); | |
| 481 | 65534 | a189 = _mm_set1_epi8(a188); | |
| 482 | 65534 | a190 = ((__m128i*)Branchtab); | |
| 483 | 65534 | a191 = *(a190); | |
| 484 | 65534 | a192 = _mm_xor_si128(a189, a191); | |
| 485 | 65534 | a193 = (b15 + 3); | |
| 486 | 65534 | a194 = *(a193); | |
| 487 | 65534 | a195 = _mm_set1_epi8(a194); | |
| 488 | 65534 | a196 = (a190 + 2); | |
| 489 | 65534 | a197 = *(a196); | |
| 490 | 65534 | a198 = _mm_xor_si128(a195, a197); | |
| 491 | 65534 | t25 = _mm_avg_epu8(a192, a198); | |
| 492 | 65534 | a199 = ((__m128i)t25); | |
| 493 | 65534 | a200 = _mm_srli_epi16(a199, 2); | |
| 494 | 65534 | a201 = ((__m128i)a200); | |
| 495 | 131068 | t26 = _mm_and_si128( | |
| 496 | a201, | ||
| 497 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 498 | 131068 | t27 = _mm_subs_epu8( | |
| 499 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 500 | t26); | ||
| 501 | 65534 | m39 = _mm_adds_epu8(s46, t26); | |
| 502 | 65534 | m40 = _mm_adds_epu8(s47, t27); | |
| 503 | 65534 | m41 = _mm_adds_epu8(s46, t27); | |
| 504 | 65534 | m42 = _mm_adds_epu8(s47, t26); | |
| 505 | 65534 | a202 = _mm_min_epu8(m40, m39); | |
| 506 | 65534 | d17 = _mm_cmpeq_epi8(a202, m40); | |
| 507 | 65534 | a203 = _mm_min_epu8(m42, m41); | |
| 508 | 65534 | d18 = _mm_cmpeq_epi8(a203, m42); | |
| 509 | 65534 | s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); | |
| 510 | 65534 | a204 = ((short int*)dec); | |
| 511 | 65534 | a205 = (8 * i9); | |
| 512 | 65534 | b16 = (a204 + a205); | |
| 513 | 65534 | a206 = (b16 + 4); | |
| 514 | 65534 | *(a206) = s48; | |
| 515 | 65534 | s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); | |
| 516 | 65534 | a207 = (b16 + 5); | |
| 517 | 65534 | *(a207) = s49; | |
| 518 | 65534 | s50 = _mm_unpacklo_epi8(a202, a203); | |
| 519 | 65534 | s51 = _mm_unpackhi_epi8(a202, a203); | |
| 520 | 65534 | a208 = ((__m128i*)X); | |
| 521 | 65534 | *(a208) = s50; | |
| 522 | 65534 | a209 = (a208 + 1); | |
| 523 | 65534 | *(a209) = s51; | |
| 524 | 65534 | a210 = (a184 + 1); | |
| 525 | 65534 | s52 = *(a210); | |
| 526 | 65534 | a211 = (a184 + 3); | |
| 527 | 65534 | s53 = *(a211); | |
| 528 | 65534 | a212 = (a190 + 1); | |
| 529 | 65534 | a213 = *(a212); | |
| 530 | 65534 | a214 = _mm_xor_si128(a189, a213); | |
| 531 | 65534 | a215 = (a190 + 3); | |
| 532 | 65534 | a216 = *(a215); | |
| 533 | 65534 | a217 = _mm_xor_si128(a195, a216); | |
| 534 | 65534 | t28 = _mm_avg_epu8(a214, a217); | |
| 535 | 65534 | a218 = ((__m128i)t28); | |
| 536 | 65534 | a219 = _mm_srli_epi16(a218, 2); | |
| 537 | 65534 | a220 = ((__m128i)a219); | |
| 538 | 131068 | t29 = _mm_and_si128( | |
| 539 | a220, | ||
| 540 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 541 | 131068 | t30 = _mm_subs_epu8( | |
| 542 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 543 | t29); | ||
| 544 | 65534 | m43 = _mm_adds_epu8(s52, t29); | |
| 545 | 65534 | m44 = _mm_adds_epu8(s53, t30); | |
| 546 | 65534 | m45 = _mm_adds_epu8(s52, t30); | |
| 547 | 65534 | m46 = _mm_adds_epu8(s53, t29); | |
| 548 | 65534 | a221 = _mm_min_epu8(m44, m43); | |
| 549 | 65534 | d19 = _mm_cmpeq_epi8(a221, m44); | |
| 550 | 65534 | a222 = _mm_min_epu8(m46, m45); | |
| 551 | 65534 | d20 = _mm_cmpeq_epi8(a222, m46); | |
| 552 | 65534 | s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); | |
| 553 | 65534 | a223 = (b16 + 6); | |
| 554 | 65534 | *(a223) = s54; | |
| 555 | 65534 | s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); | |
| 556 | 65534 | a224 = (b16 + 7); | |
| 557 | 65534 | *(a224) = s55; | |
| 558 | 65534 | s56 = _mm_unpacklo_epi8(a221, a222); | |
| 559 | 65534 | s57 = _mm_unpackhi_epi8(a221, a222); | |
| 560 | 65534 | a225 = (a208 + 2); | |
| 561 | 65534 | *(a225) = s56; | |
| 562 | 65534 | a226 = (a208 + 3); | |
| 563 | 65534 | *(a226) = s57; | |
| 564 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 65534 times. | 65534 | if ((((unsigned char*)X)[0] > 210)) { |
| 565 | __m128i m12, m13; | ||
| 566 | ✗ | m12 = ((__m128i*)X)[0]; | |
| 567 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); | |
| 568 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); | |
| 569 | ✗ | m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); | |
| 570 | __m128i m14; | ||
| 571 | ✗ | m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); | |
| 572 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), | |
| 573 | ((__m128i)m14))); | ||
| 574 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), | |
| 575 | ((__m128i)m14))); | ||
| 576 | ✗ | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), | |
| 577 | ((__m128i)m14))); | ||
| 578 | ✗ | m14 = _mm_unpacklo_epi8(m14, m14); | |
| 579 | ✗ | m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); | |
| 580 | ✗ | m13 = _mm_unpacklo_epi64(m14, m14); | |
| 581 | ✗ | ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); | |
| 582 | ✗ | ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); | |
| 583 | ✗ | ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); | |
| 584 | ✗ | ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); | |
| 585 | } | ||
| 586 | } | ||
| 587 | |||
| 588 | 2 | renormalize(X, 210); | |
| 589 | |||
| 590 | /*int ch; | ||
| 591 | for(ch = 0; ch < 64; ch++) { | ||
| 592 | printf("%d,", X[ch]); | ||
| 593 | } | ||
| 594 | printf("\n");*/ | ||
| 595 | |||
| 596 | unsigned int j; | ||
| 597 | 2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times. | 4 | for (j = 0; j < (framebits + excess) % 2; ++j) { |
| 598 | int i; | ||
| 599 | 2/2 ✓ Branch 0 taken 64 times. ✓ Branch 1 taken 2 times. | 66 | for (i = 0; i < 64 / 2; i++) { |
| 600 | 64 | BFLY(i, | |
| 601 | 64 | (((framebits + excess) >> 1) << 1) + j, | |
| 602 | syms, | ||
| 603 | Y, | ||
| 604 | X, | ||
| 605 | (decision_t*)dec, | ||
| 606 | Branchtab); | ||
| 607 | } | ||
| 608 | |||
| 609 | |||
| 610 | 2 | renormalize(Y, 210); | |
| 611 | |||
| 612 | /*printf("\n"); | ||
| 613 | for(ch = 0; ch < 64; ch++) { | ||
| 614 | printf("%d,", Y[ch]); | ||
| 615 | } | ||
| 616 | printf("\n");*/ | ||
| 617 | } | ||
| 618 | /*skip*/ | ||
| 619 | 2 | } | |
| 620 | |||
| 621 | #endif /*LV_HAVE_SSE3*/ | ||
| 622 | |||
| 623 | #if LV_HAVE_NEON | ||
| 624 | |||
| 625 | #include "volk/sse2neon.h" | ||
| 626 | |||
| 627 | static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, | ||
| 628 | unsigned char* X, | ||
| 629 | unsigned char* syms, | ||
| 630 | unsigned char* dec, | ||
| 631 | unsigned int framebits, | ||
| 632 | unsigned int excess, | ||
| 633 | unsigned char* Branchtab) | ||
| 634 | { | ||
| 635 | unsigned int i9; | ||
| 636 | for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { | ||
| 637 | unsigned char a75, a81; | ||
| 638 | int a73, a92; | ||
| 639 | short int s20, s21, s26, s27; | ||
| 640 | unsigned char *a74, *a80, *b6; | ||
| 641 | short int *a110, *a111, *a91, *a93, *a94; | ||
| 642 | __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; | ||
| 643 | __m128i a105, a106, a86, a87; | ||
| 644 | __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, | ||
| 645 | a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, | ||
| 646 | s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; | ||
| 647 | a71 = ((__m128i*)X); | ||
| 648 | s18 = *(a71); | ||
| 649 | a72 = (a71 + 2); | ||
| 650 | s19 = *(a72); | ||
| 651 | a73 = (4 * i9); | ||
| 652 | a74 = (syms + a73); | ||
| 653 | a75 = *(a74); | ||
| 654 | a76 = _mm_set1_epi8(a75); | ||
| 655 | a77 = ((__m128i*)Branchtab); | ||
| 656 | a78 = *(a77); | ||
| 657 | a79 = _mm_xor_si128(a76, a78); | ||
| 658 | b6 = (a73 + syms); | ||
| 659 | a80 = (b6 + 1); | ||
| 660 | a81 = *(a80); | ||
| 661 | a82 = _mm_set1_epi8(a81); | ||
| 662 | a83 = (a77 + 2); | ||
| 663 | a84 = *(a83); | ||
| 664 | a85 = _mm_xor_si128(a82, a84); | ||
| 665 | t13 = _mm_avg_epu8(a79, a85); | ||
| 666 | a86 = ((__m128i)t13); | ||
| 667 | a87 = _mm_srli_epi16(a86, 2); | ||
| 668 | a88 = ((__m128i)a87); | ||
| 669 | t14 = _mm_and_si128( | ||
| 670 | a88, | ||
| 671 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 672 | t15 = _mm_subs_epu8( | ||
| 673 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 674 | t14); | ||
| 675 | m23 = _mm_adds_epu8(s18, t14); | ||
| 676 | m24 = _mm_adds_epu8(s19, t15); | ||
| 677 | m25 = _mm_adds_epu8(s18, t15); | ||
| 678 | m26 = _mm_adds_epu8(s19, t14); | ||
| 679 | a89 = _mm_min_epu8(m24, m23); | ||
| 680 | d9 = _mm_cmpeq_epi8(a89, m24); | ||
| 681 | a90 = _mm_min_epu8(m26, m25); | ||
| 682 | d10 = _mm_cmpeq_epi8(a90, m26); | ||
| 683 | s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); | ||
| 684 | a91 = ((short int*)dec); | ||
| 685 | a92 = (8 * i9); | ||
| 686 | a93 = (a91 + a92); | ||
| 687 | *(a93) = s20; | ||
| 688 | s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); | ||
| 689 | a94 = (a93 + 1); | ||
| 690 | *(a94) = s21; | ||
| 691 | s22 = _mm_unpacklo_epi8(a89, a90); | ||
| 692 | s23 = _mm_unpackhi_epi8(a89, a90); | ||
| 693 | a95 = ((__m128i*)Y); | ||
| 694 | *(a95) = s22; | ||
| 695 | a96 = (a95 + 1); | ||
| 696 | *(a96) = s23; | ||
| 697 | a97 = (a71 + 1); | ||
| 698 | s24 = *(a97); | ||
| 699 | a98 = (a71 + 3); | ||
| 700 | s25 = *(a98); | ||
| 701 | a99 = (a77 + 1); | ||
| 702 | a100 = *(a99); | ||
| 703 | a101 = _mm_xor_si128(a76, a100); | ||
| 704 | a102 = (a77 + 3); | ||
| 705 | a103 = *(a102); | ||
| 706 | a104 = _mm_xor_si128(a82, a103); | ||
| 707 | t16 = _mm_avg_epu8(a101, a104); | ||
| 708 | a105 = ((__m128i)t16); | ||
| 709 | a106 = _mm_srli_epi16(a105, 2); | ||
| 710 | a107 = ((__m128i)a106); | ||
| 711 | t17 = _mm_and_si128( | ||
| 712 | a107, | ||
| 713 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 714 | t18 = _mm_subs_epu8( | ||
| 715 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 716 | t17); | ||
| 717 | m27 = _mm_adds_epu8(s24, t17); | ||
| 718 | m28 = _mm_adds_epu8(s25, t18); | ||
| 719 | m29 = _mm_adds_epu8(s24, t18); | ||
| 720 | m30 = _mm_adds_epu8(s25, t17); | ||
| 721 | a108 = _mm_min_epu8(m28, m27); | ||
| 722 | d11 = _mm_cmpeq_epi8(a108, m28); | ||
| 723 | a109 = _mm_min_epu8(m30, m29); | ||
| 724 | d12 = _mm_cmpeq_epi8(a109, m30); | ||
| 725 | s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); | ||
| 726 | a110 = (a93 + 2); | ||
| 727 | *(a110) = s26; | ||
| 728 | s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); | ||
| 729 | a111 = (a93 + 3); | ||
| 730 | *(a111) = s27; | ||
| 731 | s28 = _mm_unpacklo_epi8(a108, a109); | ||
| 732 | s29 = _mm_unpackhi_epi8(a108, a109); | ||
| 733 | a112 = (a95 + 2); | ||
| 734 | *(a112) = s28; | ||
| 735 | a113 = (a95 + 3); | ||
| 736 | *(a113) = s29; | ||
| 737 | if ((((unsigned char*)Y)[0] > 210)) { | ||
| 738 | __m128i m5, m6; | ||
| 739 | m5 = ((__m128i*)Y)[0]; | ||
| 740 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); | ||
| 741 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); | ||
| 742 | m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); | ||
| 743 | __m128i m7; | ||
| 744 | m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); | ||
| 745 | m7 = | ||
| 746 | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); | ||
| 747 | m7 = | ||
| 748 | ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); | ||
| 749 | m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); | ||
| 750 | m7 = _mm_unpacklo_epi8(m7, m7); | ||
| 751 | m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); | ||
| 752 | m6 = _mm_unpacklo_epi64(m7, m7); | ||
| 753 | ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); | ||
| 754 | ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); | ||
| 755 | ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); | ||
| 756 | ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); | ||
| 757 | } | ||
| 758 | unsigned char a188, a194; | ||
| 759 | int a186, a205; | ||
| 760 | short int s48, s49, s54, s55; | ||
| 761 | unsigned char *a187, *a193, *b15; | ||
| 762 | short int *a204, *a206, *a207, *a223, *a224, *b16; | ||
| 763 | __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, | ||
| 764 | *a225, *a226; | ||
| 765 | __m128i a199, a200, a218, a219; | ||
| 766 | __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, | ||
| 767 | a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, | ||
| 768 | m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; | ||
| 769 | a184 = ((__m128i*)Y); | ||
| 770 | s46 = *(a184); | ||
| 771 | a185 = (a184 + 2); | ||
| 772 | s47 = *(a185); | ||
| 773 | a186 = (4 * i9); | ||
| 774 | b15 = (a186 + syms); | ||
| 775 | a187 = (b15 + 2); | ||
| 776 | a188 = *(a187); | ||
| 777 | a189 = _mm_set1_epi8(a188); | ||
| 778 | a190 = ((__m128i*)Branchtab); | ||
| 779 | a191 = *(a190); | ||
| 780 | a192 = _mm_xor_si128(a189, a191); | ||
| 781 | a193 = (b15 + 3); | ||
| 782 | a194 = *(a193); | ||
| 783 | a195 = _mm_set1_epi8(a194); | ||
| 784 | a196 = (a190 + 2); | ||
| 785 | a197 = *(a196); | ||
| 786 | a198 = _mm_xor_si128(a195, a197); | ||
| 787 | t25 = _mm_avg_epu8(a192, a198); | ||
| 788 | a199 = ((__m128i)t25); | ||
| 789 | a200 = _mm_srli_epi16(a199, 2); | ||
| 790 | a201 = ((__m128i)a200); | ||
| 791 | t26 = _mm_and_si128( | ||
| 792 | a201, | ||
| 793 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 794 | t27 = _mm_subs_epu8( | ||
| 795 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 796 | t26); | ||
| 797 | m39 = _mm_adds_epu8(s46, t26); | ||
| 798 | m40 = _mm_adds_epu8(s47, t27); | ||
| 799 | m41 = _mm_adds_epu8(s46, t27); | ||
| 800 | m42 = _mm_adds_epu8(s47, t26); | ||
| 801 | a202 = _mm_min_epu8(m40, m39); | ||
| 802 | d17 = _mm_cmpeq_epi8(a202, m40); | ||
| 803 | a203 = _mm_min_epu8(m42, m41); | ||
| 804 | d18 = _mm_cmpeq_epi8(a203, m42); | ||
| 805 | s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); | ||
| 806 | a204 = ((short int*)dec); | ||
| 807 | a205 = (8 * i9); | ||
| 808 | b16 = (a204 + a205); | ||
| 809 | a206 = (b16 + 4); | ||
| 810 | *(a206) = s48; | ||
| 811 | s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); | ||
| 812 | a207 = (b16 + 5); | ||
| 813 | *(a207) = s49; | ||
| 814 | s50 = _mm_unpacklo_epi8(a202, a203); | ||
| 815 | s51 = _mm_unpackhi_epi8(a202, a203); | ||
| 816 | a208 = ((__m128i*)X); | ||
| 817 | *(a208) = s50; | ||
| 818 | a209 = (a208 + 1); | ||
| 819 | *(a209) = s51; | ||
| 820 | a210 = (a184 + 1); | ||
| 821 | s52 = *(a210); | ||
| 822 | a211 = (a184 + 3); | ||
| 823 | s53 = *(a211); | ||
| 824 | a212 = (a190 + 1); | ||
| 825 | a213 = *(a212); | ||
| 826 | a214 = _mm_xor_si128(a189, a213); | ||
| 827 | a215 = (a190 + 3); | ||
| 828 | a216 = *(a215); | ||
| 829 | a217 = _mm_xor_si128(a195, a216); | ||
| 830 | t28 = _mm_avg_epu8(a214, a217); | ||
| 831 | a218 = ((__m128i)t28); | ||
| 832 | a219 = _mm_srli_epi16(a218, 2); | ||
| 833 | a220 = ((__m128i)a219); | ||
| 834 | t29 = _mm_and_si128( | ||
| 835 | a220, | ||
| 836 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); | ||
| 837 | t30 = _mm_subs_epu8( | ||
| 838 | _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), | ||
| 839 | t29); | ||
| 840 | m43 = _mm_adds_epu8(s52, t29); | ||
| 841 | m44 = _mm_adds_epu8(s53, t30); | ||
| 842 | m45 = _mm_adds_epu8(s52, t30); | ||
| 843 | m46 = _mm_adds_epu8(s53, t29); | ||
| 844 | a221 = _mm_min_epu8(m44, m43); | ||
| 845 | d19 = _mm_cmpeq_epi8(a221, m44); | ||
| 846 | a222 = _mm_min_epu8(m46, m45); | ||
| 847 | d20 = _mm_cmpeq_epi8(a222, m46); | ||
| 848 | s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); | ||
| 849 | a223 = (b16 + 6); | ||
| 850 | *(a223) = s54; | ||
| 851 | s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); | ||
| 852 | a224 = (b16 + 7); | ||
| 853 | *(a224) = s55; | ||
| 854 | s56 = _mm_unpacklo_epi8(a221, a222); | ||
| 855 | s57 = _mm_unpackhi_epi8(a221, a222); | ||
| 856 | a225 = (a208 + 2); | ||
| 857 | *(a225) = s56; | ||
| 858 | a226 = (a208 + 3); | ||
| 859 | *(a226) = s57; | ||
| 860 | if ((((unsigned char*)X)[0] > 210)) { | ||
| 861 | __m128i m12, m13; | ||
| 862 | m12 = ((__m128i*)X)[0]; | ||
| 863 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); | ||
| 864 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); | ||
| 865 | m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); | ||
| 866 | __m128i m14; | ||
| 867 | m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); | ||
| 868 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), | ||
| 869 | ((__m128i)m14))); | ||
| 870 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), | ||
| 871 | ((__m128i)m14))); | ||
| 872 | m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), | ||
| 873 | ((__m128i)m14))); | ||
| 874 | m14 = _mm_unpacklo_epi8(m14, m14); | ||
| 875 | m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); | ||
| 876 | m13 = _mm_unpacklo_epi64(m14, m14); | ||
| 877 | ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); | ||
| 878 | ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); | ||
| 879 | ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); | ||
| 880 | ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); | ||
| 881 | } | ||
| 882 | } | ||
| 883 | |||
| 884 | renormalize(X, 210); | ||
| 885 | |||
| 886 | /*int ch; | ||
| 887 | for(ch = 0; ch < 64; ch++) { | ||
| 888 | printf("%d,", X[ch]); | ||
| 889 | } | ||
| 890 | printf("\n");*/ | ||
| 891 | |||
| 892 | unsigned int j; | ||
| 893 | for (j = 0; j < (framebits + excess) % 2; ++j) { | ||
| 894 | int i; | ||
| 895 | for (i = 0; i < 64 / 2; i++) { | ||
| 896 | BFLY(i, | ||
| 897 | (((framebits + excess) >> 1) << 1) + j, | ||
| 898 | syms, | ||
| 899 | Y, | ||
| 900 | X, | ||
| 901 | (decision_t*)dec, | ||
| 902 | Branchtab); | ||
| 903 | } | ||
| 904 | |||
| 905 | |||
| 906 | renormalize(Y, 210); | ||
| 907 | |||
| 908 | /*printf("\n"); | ||
| 909 | for(ch = 0; ch < 64; ch++) { | ||
| 910 | printf("%d,", Y[ch]); | ||
| 911 | } | ||
| 912 | printf("\n");*/ | ||
| 913 | } | ||
| 914 | /*skip*/ | ||
| 915 | } | ||
| 916 | |||
| 917 | #endif /*LV_HAVE_NEON*/ | ||
| 918 | |||
| 919 | #if LV_HAVE_GENERIC | ||
| 920 | |||
| 921 | 2 | static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, | |
| 922 | unsigned char* X, | ||
| 923 | unsigned char* syms, | ||
| 924 | unsigned char* dec, | ||
| 925 | unsigned int framebits, | ||
| 926 | unsigned int excess, | ||
| 927 | unsigned char* Branchtab) | ||
| 928 | { | ||
| 929 | 2 | int nbits = framebits + excess; | |
| 930 | 2 | int NUMSTATES = 64; | |
| 931 | 2 | int RENORMALIZE_THRESHOLD = 210; | |
| 932 | |||
| 933 | int s, i; | ||
| 934 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (s = 0; s < nbits; s++) { |
| 935 | void* tmp; | ||
| 936 | 2/2 ✓ Branch 0 taken 4194240 times. ✓ Branch 1 taken 131070 times. | 4325310 | for (i = 0; i < NUMSTATES / 2; i++) { |
| 937 | 4194240 | BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab); | |
| 938 | } | ||
| 939 | |||
| 940 | 131070 | renormalize(Y, RENORMALIZE_THRESHOLD); | |
| 941 | |||
| 942 | /// Swap pointers to old and new metrics | ||
| 943 | 131070 | tmp = (void*)X; | |
| 944 | 131070 | X = Y; | |
| 945 | 131070 | Y = (unsigned char*)tmp; | |
| 946 | } | ||
| 947 | 2 | } | |
| 948 | |||
| 949 | #endif /* LV_HAVE_GENERIC */ | ||
| 950 | |||
| 951 | #endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/ | ||
| 952 |
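
The `\b Example` block in the file header only sketches the call; below is a minimal, self-contained usage sketch showing one plausible way to size and allocate the buffers before invoking the dispatcher. The frame length, tail-bit count, and the zero-filled `syms`/`Branchtab` contents are placeholder assumptions (a real caller fills `syms` with received symbols and derives `Branchtab` from its generator polynomials); `volk_get_alignment()`, `volk_malloc()` and `volk_free()` are the standard VOLK allocation helpers.

```c
#include <string.h>
#include <volk/volk.h>

int main(void)
{
    /* Assumed sizes: 64 trellis states, rate 1/2, plus a few tail bits. */
    unsigned int framebits = 2048;           /* placeholder frame length        */
    unsigned int excess = 6;                 /* placeholder number of tail bits */
    unsigned int nbits = framebits + excess;
    size_t alignment = volk_get_alignment();

    unsigned char* X = (unsigned char*)volk_malloc(64, alignment);           /* path metrics   */
    unsigned char* Y = (unsigned char*)volk_malloc(64, alignment);           /* metric scratch */
    unsigned char* syms = (unsigned char*)volk_malloc(2 * nbits, alignment); /* 2 symbols/bit  */
    unsigned char* dec = (unsigned char*)volk_malloc(8 * nbits, alignment);  /* one 8-byte decision_t per step */
    unsigned char* Branchtab = (unsigned char*)volk_malloc(2 * 32, alignment);

    memset(X, 0, 64);
    memset(Y, 0, 64);
    memset(syms, 0, 2 * nbits);   /* would hold the demodulated symbols        */
    memset(dec, 0, 8 * nbits);    /* must start zeroed: decisions are OR-ed in */
    memset(Branchtab, 0, 2 * 32); /* would be filled from the code polynomials */

    volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab);

    volk_free(Branchtab);
    volk_free(dec);
    volk_free(syms);
    volk_free(Y);
    volk_free(X);
    return 0;
}
```

The `framebits + excess` value only determines how many trellis steps the kernel runs; turning the stored decisions into decoded bits (traceback) is left to the caller.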
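For reference, `BFLY()` packs the two decision bits of butterfly `i` at trellis step `s` into 32-bit word `i / 16 + 2 * s` of the flat decision buffer, at bit offset `(2 * i) & 31` (source lines 112-114). The helper below is an illustrative sketch only, not part of the kernel: `read_decisions` is a made-up name, and it simply inverts that packing so the stored bits can be inspected after a call (assuming 32-bit `unsigned int`, as the kernel itself does).

```c
/* Illustrative sketch: recover the two decision bits that BFLY() stored for
 * butterfly index i (0..31) at trellis step s.  It mirrors the packing
 *   d->w[i / 16 + s * 2] |= (decision0 | decision1 << 1) << ((2 * i) & 31);
 */
static inline void read_decisions(const unsigned char* dec,
                                  int s,
                                  int i,
                                  unsigned int* decision0,
                                  unsigned int* decision1)
{
    const unsigned int* w = (const unsigned int*)dec;    /* same view as decision_t.w */
    unsigned int word = w[i / 16 + s * 2];               /* two 32-bit words per step */
    unsigned int pair = (word >> ((2 * i) & 31)) & 0x3u; /* bits for this butterfly   */
    *decision0 = pair & 0x1u;
    *decision1 = (pair >> 1) & 0x1u;
}
```

A chainback routine would walk these decisions from the last step back to the first to produce the decoded bit stream; that stage is outside the scope of this kernel.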