Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2015 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h' | ||
12 | */ | ||
13 | |||
14 | #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ | ||
15 | #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ | ||
16 | #include <string.h> | ||
17 | |||
18 | 2072 | static inline unsigned int log2_of_power_of_2(unsigned int val) | |
19 | { | ||
20 | // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog | ||
21 | static const unsigned int b[] = { | ||
22 | 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 | ||
23 | }; | ||
24 | |||
25 | 2072 | unsigned int res = (val & b[0]) != 0; | |
26 | 2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 2056 times. | 2072 | res |= ((val & b[4]) != 0) << 4; |
27 | 2/2 ✓ Branch 0 taken 72 times. ✓ Branch 1 taken 2000 times. | 2072 | res |= ((val & b[3]) != 0) << 3; |
28 | 2/2 ✓ Branch 0 taken 972 times. ✓ Branch 1 taken 1100 times. | 2072 | res |= ((val & b[2]) != 0) << 2; |
29 | 2/2 ✓ Branch 0 taken 1228 times. ✓ Branch 1 taken 844 times. | 2072 | res |= ((val & b[1]) != 0) << 1; |
30 | 2072 | return res; | |
31 | } | ||
32 | |||
33 | 4200 | static inline void encodepolar_single_stage(unsigned char* frame_ptr, | |
34 | const unsigned char* temp_ptr, | ||
35 | const unsigned int num_branches, | ||
36 | const unsigned int frame_half) | ||
37 | { | ||
38 | unsigned int branch, bit; | ||
39 | 2/2 ✓ Branch 0 taken 166648 times. ✓ Branch 1 taken 4200 times. | 170848 | for (branch = 0; branch < num_branches; ++branch) { |
40 | 2/2 ✓ Branch 0 taken 1216512 times. ✓ Branch 1 taken 166648 times. | 1383160 | for (bit = 0; bit < frame_half; ++bit) { |
41 | 1216512 | *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); | |
42 | 1216512 | *(frame_ptr + frame_half) = *(temp_ptr + 1); | |
43 | 1216512 | ++frame_ptr; | |
44 | 1216512 | temp_ptr += 2; | |
45 | } | ||
46 | 166648 | frame_ptr += frame_half; | |
47 | } | ||
48 | 4200 | } | |
49 | |||
50 | #ifdef LV_HAVE_GENERIC | ||
51 | |||
52 | 1288 | static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, | |
53 | unsigned char* temp, | ||
54 | unsigned int frame_size) | ||
55 | { | ||
56 | 1288 | unsigned int stage = log2_of_power_of_2(frame_size); | |
57 | 1288 | unsigned int frame_half = frame_size >> 1; | |
58 | 1288 | unsigned int num_branches = 1; | |
59 | |||
60 | 2/2 ✓ Branch 0 taken 4200 times. ✓ Branch 1 taken 1288 times. | 5488 | while (stage) { |
61 | // encode stage | ||
62 | 4200 | encodepolar_single_stage(frame, temp, num_branches, frame_half); | |
63 | 4200 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
64 | |||
65 | // update all the parameters. | ||
66 | 4200 | num_branches = num_branches << 1; | |
67 | 4200 | frame_half = frame_half >> 1; | |
68 | 4200 | --stage; | |
69 | } | ||
70 | 1288 | } | |
71 | #endif /* LV_HAVE_GENERIC */ | ||
72 | |||
73 | #ifdef LV_HAVE_SSSE3 | ||
74 | #include <tmmintrin.h> | ||
75 | |||
76 | 1024 | static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, | |
77 | unsigned char* temp, | ||
78 | unsigned int frame_size) | ||
79 | { | ||
80 | 2/2 ✓ Branch 0 taken 512 times. ✓ Branch 1 taken 512 times. | 1024 | if (frame_size < 16) { |
81 | 512 | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
82 | 512 | return; | |
83 | } | ||
84 | |||
85 | 512 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
86 | |||
87 | 512 | unsigned int stage = po2; | |
88 | 512 | unsigned char* frame_ptr = frame; | |
89 | 512 | unsigned char* temp_ptr = temp; | |
90 | |||
91 | 512 | unsigned int frame_half = frame_size >> 1; | |
92 | 512 | unsigned int num_branches = 1; | |
93 | unsigned int branch; | ||
94 | unsigned int bit; | ||
95 | |||
96 | // prepare constants | ||
97 | 512 | const __m128i mask_stage1 = _mm_set_epi8(0x0, | |
98 | 0xFF, | ||
99 | 0x0, | ||
100 | 0xFF, | ||
101 | 0x0, | ||
102 | 0xFF, | ||
103 | 0x0, | ||
104 | 0xFF, | ||
105 | 0x0, | ||
106 | 0xFF, | ||
107 | 0x0, | ||
108 | 0xFF, | ||
109 | 0x0, | ||
110 | 0xFF, | ||
111 | 0x0, | ||
112 | 0xFF); | ||
113 | |||
114 | // get some SIMD registers to play with. | ||
115 | __m128i r_frame0, r_temp0, shifted; | ||
116 | |||
117 | { | ||
118 | __m128i r_frame1, r_temp1; | ||
119 | const __m128i shuffle_separate = | ||
120 | 512 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
121 | |||
122 | 2/2 ✓ Branch 0 taken 518 times. ✓ Branch 1 taken 512 times. | 1030 | while (stage > 4) { |
123 | 518 | frame_ptr = frame; | |
124 | 518 | temp_ptr = temp; | |
125 | |||
126 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
127 | 2/2 ✓ Branch 0 taken 9728 times. ✓ Branch 1 taken 518 times. | 10246 | for (branch = 0; branch < num_branches; ++branch) { |
128 | 2/2 ✓ Branch 0 taken 52736 times. ✓ Branch 1 taken 9728 times. | 62464 | for (bit = 0; bit < frame_half; bit += 16) { |
129 | 52736 | r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); | |
130 | 52736 | temp_ptr += 16; | |
131 | 52736 | r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr); | |
132 | 52736 | temp_ptr += 16; | |
133 | |||
134 | 52736 | shifted = _mm_srli_si128(r_temp0, 1); | |
135 | 52736 | shifted = _mm_and_si128(shifted, mask_stage1); | |
136 | 52736 | r_temp0 = _mm_xor_si128(shifted, r_temp0); | |
137 | 52736 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); | |
138 | |||
139 | 52736 | shifted = _mm_srli_si128(r_temp1, 1); | |
140 | 52736 | shifted = _mm_and_si128(shifted, mask_stage1); | |
141 | 52736 | r_temp1 = _mm_xor_si128(shifted, r_temp1); | |
142 | 52736 | r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); | |
143 | |||
144 | 52736 | r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); | |
145 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); | ||
146 | |||
147 | 52736 | r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); | |
148 | 52736 | _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1); | |
149 | 52736 | frame_ptr += 16; | |
150 | } | ||
151 | |||
152 | 9728 | frame_ptr += frame_half; | |
153 | } | ||
154 | 518 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
155 | |||
156 | 518 | num_branches = num_branches << 1; | |
157 | 518 | frame_half = frame_half >> 1; | |
158 | 518 | stage--; | |
159 | } | ||
160 | } | ||
161 | |||
162 | // This last part requires at least 16-bit frames. | ||
163 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
164 | |||
165 | // reset pointers to correct positions. | ||
166 | 512 | frame_ptr = frame; | |
167 | 512 | temp_ptr = temp; | |
168 | |||
169 | // prefetch first chunk | ||
170 | 512 | __VOLK_PREFETCH(temp_ptr); | |
171 | |||
172 | const __m128i shuffle_stage4 = | ||
173 | 512 | _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); | |
174 | 512 | const __m128i mask_stage4 = _mm_set_epi8(0x0, | |
175 | 0x0, | ||
176 | 0x0, | ||
177 | 0x0, | ||
178 | 0x0, | ||
179 | 0x0, | ||
180 | 0x0, | ||
181 | 0x0, | ||
182 | 0xFF, | ||
183 | 0xFF, | ||
184 | 0xFF, | ||
185 | 0xFF, | ||
186 | 0xFF, | ||
187 | 0xFF, | ||
188 | 0xFF, | ||
189 | 0xFF); | ||
190 | 512 | const __m128i mask_stage3 = _mm_set_epi8(0x0, | |
191 | 0x0, | ||
192 | 0x0, | ||
193 | 0x0, | ||
194 | 0xFF, | ||
195 | 0xFF, | ||
196 | 0xFF, | ||
197 | 0xFF, | ||
198 | 0x0, | ||
199 | 0x0, | ||
200 | 0x0, | ||
201 | 0x0, | ||
202 | 0xFF, | ||
203 | 0xFF, | ||
204 | 0xFF, | ||
205 | 0xFF); | ||
206 | 512 | const __m128i mask_stage2 = _mm_set_epi8(0x0, | |
207 | 0x0, | ||
208 | 0xFF, | ||
209 | 0xFF, | ||
210 | 0x0, | ||
211 | 0x0, | ||
212 | 0xFF, | ||
213 | 0xFF, | ||
214 | 0x0, | ||
215 | 0x0, | ||
216 | 0xFF, | ||
217 | 0xFF, | ||
218 | 0x0, | ||
219 | 0x0, | ||
220 | 0xFF, | ||
221 | 0xFF); | ||
222 | |||
223 | 2/2 ✓ Branch 0 taken 10240 times. ✓ Branch 1 taken 512 times. | 10752 | for (branch = 0; branch < num_branches; ++branch) { |
224 | 10240 | r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); | |
225 | |||
226 | // prefetch next chunk | ||
227 | 10240 | temp_ptr += 16; | |
228 | 10240 | __VOLK_PREFETCH(temp_ptr); | |
229 | |||
230 | // shuffle once for bit-reversal. | ||
231 | 10240 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); | |
232 | |||
233 | 10240 | shifted = _mm_srli_si128(r_temp0, 8); | |
234 | 10240 | shifted = _mm_and_si128(shifted, mask_stage4); | |
235 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_temp0); | |
236 | |||
237 | 10240 | shifted = _mm_srli_si128(r_frame0, 4); | |
238 | 10240 | shifted = _mm_and_si128(shifted, mask_stage3); | |
239 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
240 | |||
241 | 10240 | shifted = _mm_srli_si128(r_frame0, 2); | |
242 | 10240 | shifted = _mm_and_si128(shifted, mask_stage2); | |
243 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
244 | |||
245 | 10240 | shifted = _mm_srli_si128(r_frame0, 1); | |
246 | 10240 | shifted = _mm_and_si128(shifted, mask_stage1); | |
247 | 10240 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
248 | |||
249 | // store result of chunk. | ||
250 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); | ||
251 | 10240 | frame_ptr += 16; | |
252 | } | ||
253 | } | ||
254 | |||
255 | #endif /* LV_HAVE_SSSE3 */ | ||
256 | |||
257 | #ifdef LV_HAVE_AVX2 | ||
258 | #include <immintrin.h> | ||
259 | |||
260 | 1024 | static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, | |
261 | unsigned char* temp, | ||
262 | unsigned int frame_size) | ||
263 | { | ||
264 | 2/2 ✓ Branch 0 taken 768 times. ✓ Branch 1 taken 256 times. | 1024 | if (frame_size < 32) { |
265 | 768 | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
266 | 768 | return; | |
267 | } | ||
268 | |||
269 | 256 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
270 | |||
271 | 256 | unsigned int stage = po2; | |
272 | 256 | unsigned char* frame_ptr = frame; | |
273 | 256 | unsigned char* temp_ptr = temp; | |
274 | |||
275 | 256 | unsigned int frame_half = frame_size >> 1; | |
276 | 256 | unsigned int num_branches = 1; | |
277 | unsigned int branch; | ||
278 | unsigned int bit; | ||
279 | |||
280 | // prepare constants | ||
281 | 256 | const __m256i mask_stage1 = _mm256_set_epi8(0x0, | |
282 | 0xFF, | ||
283 | 0x0, | ||
284 | 0xFF, | ||
285 | 0x0, | ||
286 | 0xFF, | ||
287 | 0x0, | ||
288 | 0xFF, | ||
289 | 0x0, | ||
290 | 0xFF, | ||
291 | 0x0, | ||
292 | 0xFF, | ||
293 | 0x0, | ||
294 | 0xFF, | ||
295 | 0x0, | ||
296 | 0xFF, | ||
297 | 0x0, | ||
298 | 0xFF, | ||
299 | 0x0, | ||
300 | 0xFF, | ||
301 | 0x0, | ||
302 | 0xFF, | ||
303 | 0x0, | ||
304 | 0xFF, | ||
305 | 0x0, | ||
306 | 0xFF, | ||
307 | 0x0, | ||
308 | 0xFF, | ||
309 | 0x0, | ||
310 | 0xFF, | ||
311 | 0x0, | ||
312 | 0xFF); | ||
313 | |||
314 | 256 | const __m128i mask_stage0 = _mm_set_epi8(0x0, | |
315 | 0xFF, | ||
316 | 0x0, | ||
317 | 0xFF, | ||
318 | 0x0, | ||
319 | 0xFF, | ||
320 | 0x0, | ||
321 | 0xFF, | ||
322 | 0x0, | ||
323 | 0xFF, | ||
324 | 0x0, | ||
325 | 0xFF, | ||
326 | 0x0, | ||
327 | 0xFF, | ||
328 | 0x0, | ||
329 | 0xFF); | ||
330 | // get some SIMD registers to play with. | ||
331 | __m256i r_frame0, r_temp0, shifted; | ||
332 | __m128i r_temp2, r_frame2, shifted2; | ||
333 | { | ||
334 | __m256i r_frame1, r_temp1; | ||
335 | __m128i r_frame3, r_temp3; | ||
336 | 256 | const __m256i shuffle_separate = _mm256_setr_epi8(0, | |
337 | 2, | ||
338 | 4, | ||
339 | 6, | ||
340 | 8, | ||
341 | 10, | ||
342 | 12, | ||
343 | 14, | ||
344 | 1, | ||
345 | 3, | ||
346 | 5, | ||
347 | 7, | ||
348 | 9, | ||
349 | 11, | ||
350 | 13, | ||
351 | 15, | ||
352 | 0, | ||
353 | 2, | ||
354 | 4, | ||
355 | 6, | ||
356 | 8, | ||
357 | 10, | ||
358 | 12, | ||
359 | 14, | ||
360 | 1, | ||
361 | 3, | ||
362 | 5, | ||
363 | 7, | ||
364 | 9, | ||
365 | 11, | ||
366 | 13, | ||
367 | 15); | ||
368 | const __m128i shuffle_separate128 = | ||
369 | 256 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
370 | |||
371 | 2/2 ✓ Branch 0 taken 518 times. ✓ Branch 1 taken 256 times. | 774 | while (stage > 4) { |
372 | 518 | frame_ptr = frame; | |
373 | 518 | temp_ptr = temp; | |
374 | |||
375 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
376 | 2/2 ✓ Branch 0 taken 9728 times. ✓ Branch 1 taken 518 times. | 10246 | for (branch = 0; branch < num_branches; ++branch) { |
377 | 2/2 ✓ Branch 0 taken 28864 times. ✓ Branch 1 taken 4736 times. | 33600 | for (bit = 0; bit < frame_half; bit += 32) { |
378 | 2/2 ✓ Branch 0 taken 4992 times. ✓ Branch 1 taken 23872 times. | 28864 | if ((frame_half - bit) < |
379 | 32) // if only 16 bits remaining in frame, not 32 | ||
380 | { | ||
381 | 4992 | r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr); | |
382 | 4992 | temp_ptr += 16; | |
383 | 4992 | r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr); | |
384 | 4992 | temp_ptr += 16; | |
385 | |||
386 | 4992 | shifted2 = _mm_srli_si128(r_temp2, 1); | |
387 | 4992 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
388 | 4992 | r_temp2 = _mm_xor_si128(shifted2, r_temp2); | |
389 | 4992 | r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); | |
390 | |||
391 | 4992 | shifted2 = _mm_srli_si128(r_temp3, 1); | |
392 | 4992 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
393 | 4992 | r_temp3 = _mm_xor_si128(shifted2, r_temp3); | |
394 | 4992 | r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); | |
395 | |||
396 | 4992 | r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); | |
397 | _mm_storeu_si128((__m128i*)frame_ptr, r_frame2); | ||
398 | |||
399 | 4992 | r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); | |
400 | 4992 | _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3); | |
401 | 4992 | frame_ptr += 16; | |
402 | 4992 | break; | |
403 | } | ||
404 | 23872 | r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
405 | 23872 | temp_ptr += 32; | |
406 | 23872 | r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
407 | 23872 | temp_ptr += 32; | |
408 | |||
409 | 23872 | shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes | |
410 | 23872 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
411 | 23872 | r_temp0 = _mm256_xor_si256(shifted, r_temp0); | |
412 | 23872 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); | |
413 | |||
414 | 23872 | shifted = _mm256_srli_si256(r_temp1, 1); | |
415 | 23872 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
416 | 23872 | r_temp1 = _mm256_xor_si256(shifted, r_temp1); | |
417 | 23872 | r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); | |
418 | |||
419 | 23872 | r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); | |
420 | 23872 | r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); | |
421 | 23872 | r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); | |
422 | 23872 | r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); | |
423 | |||
424 | _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); | ||
425 | |||
426 | 23872 | _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1); | |
427 | 23872 | frame_ptr += 32; | |
428 | } | ||
429 | |||
430 | 9728 | frame_ptr += frame_half; | |
431 | } | ||
432 | 518 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
433 | |||
434 | 518 | num_branches = num_branches << 1; | |
435 | 518 | frame_half = frame_half >> 1; | |
436 | 518 | stage--; | |
437 | } | ||
438 | } | ||
439 | |||
440 | // This last part requires at least 32-bit frames. | ||
441 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
442 | |||
443 | // reset pointers to correct positions. | ||
444 | 256 | frame_ptr = frame; | |
445 | 256 | temp_ptr = temp; | |
446 | |||
447 | // prefetch first chunk | ||
448 | 256 | __VOLK_PREFETCH(temp_ptr); | |
449 | |||
450 | 256 | const __m256i shuffle_stage4 = _mm256_setr_epi8(0, | |
451 | 8, | ||
452 | 4, | ||
453 | 12, | ||
454 | 2, | ||
455 | 10, | ||
456 | 6, | ||
457 | 14, | ||
458 | 1, | ||
459 | 9, | ||
460 | 5, | ||
461 | 13, | ||
462 | 3, | ||
463 | 11, | ||
464 | 7, | ||
465 | 15, | ||
466 | 0, | ||
467 | 8, | ||
468 | 4, | ||
469 | 12, | ||
470 | 2, | ||
471 | 10, | ||
472 | 6, | ||
473 | 14, | ||
474 | 1, | ||
475 | 9, | ||
476 | 5, | ||
477 | 13, | ||
478 | 3, | ||
479 | 11, | ||
480 | 7, | ||
481 | 15); | ||
482 | 256 | const __m256i mask_stage4 = _mm256_set_epi8(0x0, | |
483 | 0x0, | ||
484 | 0x0, | ||
485 | 0x0, | ||
486 | 0x0, | ||
487 | 0x0, | ||
488 | 0x0, | ||
489 | 0x0, | ||
490 | 0xFF, | ||
491 | 0xFF, | ||
492 | 0xFF, | ||
493 | 0xFF, | ||
494 | 0xFF, | ||
495 | 0xFF, | ||
496 | 0xFF, | ||
497 | 0xFF, | ||
498 | 0x0, | ||
499 | 0x0, | ||
500 | 0x0, | ||
501 | 0x0, | ||
502 | 0x0, | ||
503 | 0x0, | ||
504 | 0x0, | ||
505 | 0x0, | ||
506 | 0xFF, | ||
507 | 0xFF, | ||
508 | 0xFF, | ||
509 | 0xFF, | ||
510 | 0xFF, | ||
511 | 0xFF, | ||
512 | 0xFF, | ||
513 | 0xFF); | ||
514 | 256 | const __m256i mask_stage3 = _mm256_set_epi8(0x0, | |
515 | 0x0, | ||
516 | 0x0, | ||
517 | 0x0, | ||
518 | 0xFF, | ||
519 | 0xFF, | ||
520 | 0xFF, | ||
521 | 0xFF, | ||
522 | 0x0, | ||
523 | 0x0, | ||
524 | 0x0, | ||
525 | 0x0, | ||
526 | 0xFF, | ||
527 | 0xFF, | ||
528 | 0xFF, | ||
529 | 0xFF, | ||
530 | 0x0, | ||
531 | 0x0, | ||
532 | 0x0, | ||
533 | 0x0, | ||
534 | 0xFF, | ||
535 | 0xFF, | ||
536 | 0xFF, | ||
537 | 0xFF, | ||
538 | 0x0, | ||
539 | 0x0, | ||
540 | 0x0, | ||
541 | 0x0, | ||
542 | 0xFF, | ||
543 | 0xFF, | ||
544 | 0xFF, | ||
545 | 0xFF); | ||
546 | 256 | const __m256i mask_stage2 = _mm256_set_epi8(0x0, | |
547 | 0x0, | ||
548 | 0xFF, | ||
549 | 0xFF, | ||
550 | 0x0, | ||
551 | 0x0, | ||
552 | 0xFF, | ||
553 | 0xFF, | ||
554 | 0x0, | ||
555 | 0x0, | ||
556 | 0xFF, | ||
557 | 0xFF, | ||
558 | 0x0, | ||
559 | 0x0, | ||
560 | 0xFF, | ||
561 | 0xFF, | ||
562 | 0x0, | ||
563 | 0x0, | ||
564 | 0xFF, | ||
565 | 0xFF, | ||
566 | 0x0, | ||
567 | 0x0, | ||
568 | 0xFF, | ||
569 | 0xFF, | ||
570 | 0x0, | ||
571 | 0x0, | ||
572 | 0xFF, | ||
573 | 0xFF, | ||
574 | 0x0, | ||
575 | 0x0, | ||
576 | 0xFF, | ||
577 | 0xFF); | ||
578 | |||
579 | 2/2 ✓ Branch 0 taken 4992 times. ✓ Branch 1 taken 256 times. | 5248 | for (branch = 0; branch < num_branches / 2; ++branch) { |
580 | 4992 | r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); | |
581 | |||
582 | // prefetch next chunk | ||
583 | 4992 | temp_ptr += 32; | |
584 | 4992 | __VOLK_PREFETCH(temp_ptr); | |
585 | |||
586 | // shuffle once for bit-reversal. | ||
587 | 4992 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); | |
588 | |||
589 | 4992 | shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes | |
590 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage4); | |
591 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_temp0); | |
592 | |||
593 | |||
594 | 4992 | shifted = _mm256_srli_si256(r_frame0, 4); | |
595 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage3); | |
596 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
597 | |||
598 | 4992 | shifted = _mm256_srli_si256(r_frame0, 2); | |
599 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage2); | |
600 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
601 | |||
602 | 4992 | shifted = _mm256_srli_si256(r_frame0, 1); | |
603 | 4992 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
604 | 4992 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
605 | |||
606 | // store result of chunk. | ||
607 | _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); | ||
608 | 4992 | frame_ptr += 32; | |
609 | } | ||
610 | } | ||
611 | #endif /* LV_HAVE_AVX2 */ | ||
612 | |||
613 | #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */ | ||
614 | |||
615 | #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ | ||
616 | #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ | ||
617 | |||
618 | #ifdef LV_HAVE_SSSE3 | ||
619 | #include <tmmintrin.h> | ||
620 | |||
621 | 2 | static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, | |
622 | unsigned char* temp, | ||
623 | unsigned int frame_size) | ||
624 | { | ||
625 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times. | 2 | if (frame_size < 16) { |
626 | ✗ | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
627 | ✗ | return; | |
628 | } | ||
629 | |||
630 | 2 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
631 | |||
632 | 2 | unsigned int stage = po2; | |
633 | 2 | unsigned char* frame_ptr = frame; | |
634 | 2 | unsigned char* temp_ptr = temp; | |
635 | |||
636 | 2 | unsigned int frame_half = frame_size >> 1; | |
637 | 2 | unsigned int num_branches = 1; | |
638 | unsigned int branch; | ||
639 | unsigned int bit; | ||
640 | |||
641 | // prepare constants | ||
642 | 2 | const __m128i mask_stage1 = _mm_set_epi8(0x0, | |
643 | 0xFF, | ||
644 | 0x0, | ||
645 | 0xFF, | ||
646 | 0x0, | ||
647 | 0xFF, | ||
648 | 0x0, | ||
649 | 0xFF, | ||
650 | 0x0, | ||
651 | 0xFF, | ||
652 | 0x0, | ||
653 | 0xFF, | ||
654 | 0x0, | ||
655 | 0xFF, | ||
656 | 0x0, | ||
657 | 0xFF); | ||
658 | |||
659 | // get some SIMD registers to play with. | ||
660 | __m128i r_frame0, r_temp0, shifted; | ||
661 | |||
662 | { | ||
663 | __m128i r_frame1, r_temp1; | ||
664 | const __m128i shuffle_separate = | ||
665 | 2 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
666 | |||
667 | 2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 2 times. | 26 | while (stage > 4) { |
668 | 24 | frame_ptr = frame; | |
669 | 24 | temp_ptr = temp; | |
670 | |||
671 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
672 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 24 times. | 8214 | for (branch = 0; branch < num_branches; ++branch) { |
673 | 2/2 ✓ Branch 0 taken 49152 times. ✓ Branch 1 taken 8190 times. | 57342 | for (bit = 0; bit < frame_half; bit += 16) { |
674 | 49152 | r_temp0 = _mm_load_si128((__m128i*)temp_ptr); | |
675 | 49152 | temp_ptr += 16; | |
676 | 49152 | r_temp1 = _mm_load_si128((__m128i*)temp_ptr); | |
677 | 49152 | temp_ptr += 16; | |
678 | |||
679 | 49152 | shifted = _mm_srli_si128(r_temp0, 1); | |
680 | 49152 | shifted = _mm_and_si128(shifted, mask_stage1); | |
681 | 49152 | r_temp0 = _mm_xor_si128(shifted, r_temp0); | |
682 | 49152 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); | |
683 | |||
684 | 49152 | shifted = _mm_srli_si128(r_temp1, 1); | |
685 | 49152 | shifted = _mm_and_si128(shifted, mask_stage1); | |
686 | 49152 | r_temp1 = _mm_xor_si128(shifted, r_temp1); | |
687 | 49152 | r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); | |
688 | |||
689 | 49152 | r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); | |
690 | _mm_store_si128((__m128i*)frame_ptr, r_frame0); | ||
691 | |||
692 | 49152 | r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); | |
693 | 49152 | _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1); | |
694 | 49152 | frame_ptr += 16; | |
695 | } | ||
696 | |||
697 | 8190 | frame_ptr += frame_half; | |
698 | } | ||
699 | 24 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
700 | |||
701 | 24 | num_branches = num_branches << 1; | |
702 | 24 | frame_half = frame_half >> 1; | |
703 | 24 | stage--; | |
704 | } | ||
705 | } | ||
706 | |||
707 | // This last part requires at least 16-bit frames. | ||
708 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
709 | |||
710 | // reset pointers to correct positions. | ||
711 | 2 | frame_ptr = frame; | |
712 | 2 | temp_ptr = temp; | |
713 | |||
714 | // prefetch first chunk | ||
715 | 2 | __VOLK_PREFETCH(temp_ptr); | |
716 | |||
717 | const __m128i shuffle_stage4 = | ||
718 | 2 | _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); | |
719 | 2 | const __m128i mask_stage4 = _mm_set_epi8(0x0, | |
720 | 0x0, | ||
721 | 0x0, | ||
722 | 0x0, | ||
723 | 0x0, | ||
724 | 0x0, | ||
725 | 0x0, | ||
726 | 0x0, | ||
727 | 0xFF, | ||
728 | 0xFF, | ||
729 | 0xFF, | ||
730 | 0xFF, | ||
731 | 0xFF, | ||
732 | 0xFF, | ||
733 | 0xFF, | ||
734 | 0xFF); | ||
735 | 2 | const __m128i mask_stage3 = _mm_set_epi8(0x0, | |
736 | 0x0, | ||
737 | 0x0, | ||
738 | 0x0, | ||
739 | 0xFF, | ||
740 | 0xFF, | ||
741 | 0xFF, | ||
742 | 0xFF, | ||
743 | 0x0, | ||
744 | 0x0, | ||
745 | 0x0, | ||
746 | 0x0, | ||
747 | 0xFF, | ||
748 | 0xFF, | ||
749 | 0xFF, | ||
750 | 0xFF); | ||
751 | 2 | const __m128i mask_stage2 = _mm_set_epi8(0x0, | |
752 | 0x0, | ||
753 | 0xFF, | ||
754 | 0xFF, | ||
755 | 0x0, | ||
756 | 0x0, | ||
757 | 0xFF, | ||
758 | 0xFF, | ||
759 | 0x0, | ||
760 | 0x0, | ||
761 | 0xFF, | ||
762 | 0xFF, | ||
763 | 0x0, | ||
764 | 0x0, | ||
765 | 0xFF, | ||
766 | 0xFF); | ||
767 | |||
768 | 2/2 ✓ Branch 0 taken 8192 times. ✓ Branch 1 taken 2 times. | 8194 | for (branch = 0; branch < num_branches; ++branch) { |
769 | 8192 | r_temp0 = _mm_load_si128((__m128i*)temp_ptr); | |
770 | |||
771 | // prefetch next chunk | ||
772 | 8192 | temp_ptr += 16; | |
773 | 8192 | __VOLK_PREFETCH(temp_ptr); | |
774 | |||
775 | // shuffle once for bit-reversal. | ||
776 | 8192 | r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); | |
777 | |||
778 | 8192 | shifted = _mm_srli_si128(r_temp0, 8); | |
779 | 8192 | shifted = _mm_and_si128(shifted, mask_stage4); | |
780 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_temp0); | |
781 | |||
782 | 8192 | shifted = _mm_srli_si128(r_frame0, 4); | |
783 | 8192 | shifted = _mm_and_si128(shifted, mask_stage3); | |
784 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
785 | |||
786 | 8192 | shifted = _mm_srli_si128(r_frame0, 2); | |
787 | 8192 | shifted = _mm_and_si128(shifted, mask_stage2); | |
788 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
789 | |||
790 | 8192 | shifted = _mm_srli_si128(r_frame0, 1); | |
791 | 8192 | shifted = _mm_and_si128(shifted, mask_stage1); | |
792 | 8192 | r_frame0 = _mm_xor_si128(shifted, r_frame0); | |
793 | |||
794 | // store result of chunk. | ||
795 | _mm_store_si128((__m128i*)frame_ptr, r_frame0); | ||
796 | 8192 | frame_ptr += 16; | |
797 | } | ||
798 | } | ||
799 | #endif /* LV_HAVE_SSSE3 */ | ||
800 | |||
801 | #ifdef LV_HAVE_AVX2 | ||
802 | #include <immintrin.h> | ||
803 | |||
804 | 2 | static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, | |
805 | unsigned char* temp, | ||
806 | unsigned int frame_size) | ||
807 | { | ||
808 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times. | 2 | if (frame_size < 32) { |
809 | ✗ | volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); | |
810 | ✗ | return; | |
811 | } | ||
812 | |||
813 | 2 | const unsigned int po2 = log2_of_power_of_2(frame_size); | |
814 | |||
815 | 2 | unsigned int stage = po2; | |
816 | 2 | unsigned char* frame_ptr = frame; | |
817 | 2 | unsigned char* temp_ptr = temp; | |
818 | |||
819 | 2 | unsigned int frame_half = frame_size >> 1; | |
820 | 2 | unsigned int num_branches = 1; | |
821 | unsigned int branch; | ||
822 | unsigned int bit; | ||
823 | |||
824 | // prepare constants | ||
825 | 2 | const __m256i mask_stage1 = _mm256_set_epi8(0x0, | |
826 | 0xFF, | ||
827 | 0x0, | ||
828 | 0xFF, | ||
829 | 0x0, | ||
830 | 0xFF, | ||
831 | 0x0, | ||
832 | 0xFF, | ||
833 | 0x0, | ||
834 | 0xFF, | ||
835 | 0x0, | ||
836 | 0xFF, | ||
837 | 0x0, | ||
838 | 0xFF, | ||
839 | 0x0, | ||
840 | 0xFF, | ||
841 | 0x0, | ||
842 | 0xFF, | ||
843 | 0x0, | ||
844 | 0xFF, | ||
845 | 0x0, | ||
846 | 0xFF, | ||
847 | 0x0, | ||
848 | 0xFF, | ||
849 | 0x0, | ||
850 | 0xFF, | ||
851 | 0x0, | ||
852 | 0xFF, | ||
853 | 0x0, | ||
854 | 0xFF, | ||
855 | 0x0, | ||
856 | 0xFF); | ||
857 | |||
858 | 2 | const __m128i mask_stage0 = _mm_set_epi8(0x0, | |
859 | 0xFF, | ||
860 | 0x0, | ||
861 | 0xFF, | ||
862 | 0x0, | ||
863 | 0xFF, | ||
864 | 0x0, | ||
865 | 0xFF, | ||
866 | 0x0, | ||
867 | 0xFF, | ||
868 | 0x0, | ||
869 | 0xFF, | ||
870 | 0x0, | ||
871 | 0xFF, | ||
872 | 0x0, | ||
873 | 0xFF); | ||
874 | // get some SIMD registers to play with. | ||
875 | __m256i r_frame0, r_temp0, shifted; | ||
876 | __m128i r_temp2, r_frame2, shifted2; | ||
877 | { | ||
878 | __m256i r_frame1, r_temp1; | ||
879 | __m128i r_frame3, r_temp3; | ||
880 | 2 | const __m256i shuffle_separate = _mm256_setr_epi8(0, | |
881 | 2, | ||
882 | 4, | ||
883 | 6, | ||
884 | 8, | ||
885 | 10, | ||
886 | 12, | ||
887 | 14, | ||
888 | 1, | ||
889 | 3, | ||
890 | 5, | ||
891 | 7, | ||
892 | 9, | ||
893 | 11, | ||
894 | 13, | ||
895 | 15, | ||
896 | 0, | ||
897 | 2, | ||
898 | 4, | ||
899 | 6, | ||
900 | 8, | ||
901 | 10, | ||
902 | 12, | ||
903 | 14, | ||
904 | 1, | ||
905 | 3, | ||
906 | 5, | ||
907 | 7, | ||
908 | 9, | ||
909 | 11, | ||
910 | 13, | ||
911 | 15); | ||
912 | const __m128i shuffle_separate128 = | ||
913 | 2 | _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); | |
914 | |||
915 | 2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 2 times. | 26 | while (stage > 4) { |
916 | 24 | frame_ptr = frame; | |
917 | 24 | temp_ptr = temp; | |
918 | |||
919 | // for stage = 5 a branch has 32 elements. So upper stages are even bigger. | ||
920 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 24 times. | 8214 | for (branch = 0; branch < num_branches; ++branch) { |
921 | 2/2 ✓ Branch 0 taken 26624 times. ✓ Branch 1 taken 4094 times. | 30718 | for (bit = 0; bit < frame_half; bit += 32) { |
922 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 22528 times. | 26624 | if ((frame_half - bit) < |
923 | 32) // if only 16 bits remaining in frame, not 32 | ||
924 | { | ||
925 | 4096 | r_temp2 = _mm_load_si128((__m128i*)temp_ptr); | |
926 | 4096 | temp_ptr += 16; | |
927 | 4096 | r_temp3 = _mm_load_si128((__m128i*)temp_ptr); | |
928 | 4096 | temp_ptr += 16; | |
929 | |||
930 | 4096 | shifted2 = _mm_srli_si128(r_temp2, 1); | |
931 | 4096 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
932 | 4096 | r_temp2 = _mm_xor_si128(shifted2, r_temp2); | |
933 | 4096 | r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); | |
934 | |||
935 | 4096 | shifted2 = _mm_srli_si128(r_temp3, 1); | |
936 | 4096 | shifted2 = _mm_and_si128(shifted2, mask_stage0); | |
937 | 4096 | r_temp3 = _mm_xor_si128(shifted2, r_temp3); | |
938 | 4096 | r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); | |
939 | |||
940 | 4096 | r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); | |
941 | _mm_store_si128((__m128i*)frame_ptr, r_frame2); | ||
942 | |||
943 | 4096 | r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); | |
944 | 4096 | _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3); | |
945 | 4096 | frame_ptr += 16; | |
946 | 4096 | break; | |
947 | } | ||
948 | 22528 | r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); | |
949 | 22528 | temp_ptr += 32; | |
950 | 22528 | r_temp1 = _mm256_load_si256((__m256i*)temp_ptr); | |
951 | 22528 | temp_ptr += 32; | |
952 | |||
953 | 22528 | shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes | |
954 | 22528 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
955 | 22528 | r_temp0 = _mm256_xor_si256(shifted, r_temp0); | |
956 | 22528 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); | |
957 | |||
958 | 22528 | shifted = _mm256_srli_si256(r_temp1, 1); | |
959 | 22528 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
960 | 22528 | r_temp1 = _mm256_xor_si256(shifted, r_temp1); | |
961 | 22528 | r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); | |
962 | |||
963 | 22528 | r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); | |
964 | 22528 | r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); | |
965 | 22528 | r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); | |
966 | 22528 | r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); | |
967 | |||
968 | _mm256_store_si256((__m256i*)frame_ptr, r_frame0); | ||
969 | |||
970 | 22528 | _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1); | |
971 | 22528 | frame_ptr += 32; | |
972 | } | ||
973 | |||
974 | 8190 | frame_ptr += frame_half; | |
975 | } | ||
976 | 24 | memcpy(temp, frame, sizeof(unsigned char) * frame_size); | |
977 | |||
978 | 24 | num_branches = num_branches << 1; | |
979 | 24 | frame_half = frame_half >> 1; | |
980 | 24 | stage--; | |
981 | } | ||
982 | } | ||
983 | |||
984 | // This last part requires at least 32-bit frames. | ||
985 | // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! | ||
986 | |||
987 | // reset pointers to correct positions. | ||
988 | 2 | frame_ptr = frame; | |
989 | 2 | temp_ptr = temp; | |
990 | |||
991 | // prefetch first chunk. | ||
992 | 2 | __VOLK_PREFETCH(temp_ptr); | |
993 | |||
994 | 2 | const __m256i shuffle_stage4 = _mm256_setr_epi8(0, | |
995 | 8, | ||
996 | 4, | ||
997 | 12, | ||
998 | 2, | ||
999 | 10, | ||
1000 | 6, | ||
1001 | 14, | ||
1002 | 1, | ||
1003 | 9, | ||
1004 | 5, | ||
1005 | 13, | ||
1006 | 3, | ||
1007 | 11, | ||
1008 | 7, | ||
1009 | 15, | ||
1010 | 0, | ||
1011 | 8, | ||
1012 | 4, | ||
1013 | 12, | ||
1014 | 2, | ||
1015 | 10, | ||
1016 | 6, | ||
1017 | 14, | ||
1018 | 1, | ||
1019 | 9, | ||
1020 | 5, | ||
1021 | 13, | ||
1022 | 3, | ||
1023 | 11, | ||
1024 | 7, | ||
1025 | 15); | ||
1026 | 2 | const __m256i mask_stage4 = _mm256_set_epi8(0x0, | |
1027 | 0x0, | ||
1028 | 0x0, | ||
1029 | 0x0, | ||
1030 | 0x0, | ||
1031 | 0x0, | ||
1032 | 0x0, | ||
1033 | 0x0, | ||
1034 | 0xFF, | ||
1035 | 0xFF, | ||
1036 | 0xFF, | ||
1037 | 0xFF, | ||
1038 | 0xFF, | ||
1039 | 0xFF, | ||
1040 | 0xFF, | ||
1041 | 0xFF, | ||
1042 | 0x0, | ||
1043 | 0x0, | ||
1044 | 0x0, | ||
1045 | 0x0, | ||
1046 | 0x0, | ||
1047 | 0x0, | ||
1048 | 0x0, | ||
1049 | 0x0, | ||
1050 | 0xFF, | ||
1051 | 0xFF, | ||
1052 | 0xFF, | ||
1053 | 0xFF, | ||
1054 | 0xFF, | ||
1055 | 0xFF, | ||
1056 | 0xFF, | ||
1057 | 0xFF); | ||
1058 | 2 | const __m256i mask_stage3 = _mm256_set_epi8(0x0, | |
1059 | 0x0, | ||
1060 | 0x0, | ||
1061 | 0x0, | ||
1062 | 0xFF, | ||
1063 | 0xFF, | ||
1064 | 0xFF, | ||
1065 | 0xFF, | ||
1066 | 0x0, | ||
1067 | 0x0, | ||
1068 | 0x0, | ||
1069 | 0x0, | ||
1070 | 0xFF, | ||
1071 | 0xFF, | ||
1072 | 0xFF, | ||
1073 | 0xFF, | ||
1074 | 0x0, | ||
1075 | 0x0, | ||
1076 | 0x0, | ||
1077 | 0x0, | ||
1078 | 0xFF, | ||
1079 | 0xFF, | ||
1080 | 0xFF, | ||
1081 | 0xFF, | ||
1082 | 0x0, | ||
1083 | 0x0, | ||
1084 | 0x0, | ||
1085 | 0x0, | ||
1086 | 0xFF, | ||
1087 | 0xFF, | ||
1088 | 0xFF, | ||
1089 | 0xFF); | ||
1090 | 2 | const __m256i mask_stage2 = _mm256_set_epi8(0x0, | |
1091 | 0x0, | ||
1092 | 0xFF, | ||
1093 | 0xFF, | ||
1094 | 0x0, | ||
1095 | 0x0, | ||
1096 | 0xFF, | ||
1097 | 0xFF, | ||
1098 | 0x0, | ||
1099 | 0x0, | ||
1100 | 0xFF, | ||
1101 | 0xFF, | ||
1102 | 0x0, | ||
1103 | 0x0, | ||
1104 | 0xFF, | ||
1105 | 0xFF, | ||
1106 | 0x0, | ||
1107 | 0x0, | ||
1108 | 0xFF, | ||
1109 | 0xFF, | ||
1110 | 0x0, | ||
1111 | 0x0, | ||
1112 | 0xFF, | ||
1113 | 0xFF, | ||
1114 | 0x0, | ||
1115 | 0x0, | ||
1116 | 0xFF, | ||
1117 | 0xFF, | ||
1118 | 0x0, | ||
1119 | 0x0, | ||
1120 | 0xFF, | ||
1121 | 0xFF); | ||
1122 | |||
1123 | 2/2 ✓ Branch 0 taken 4096 times. ✓ Branch 1 taken 2 times. | 4098 | for (branch = 0; branch < num_branches / 2; ++branch) { |
1124 | 4096 | r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); | |
1125 | |||
1126 | // prefetch next chunk | ||
1127 | 4096 | temp_ptr += 32; | |
1128 | 4096 | __VOLK_PREFETCH(temp_ptr); | |
1129 | |||
1130 | // shuffle once for bit-reversal. | ||
1131 | 4096 | r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); | |
1132 | |||
1133 | 4096 | shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes | |
1134 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage4); | |
1135 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_temp0); | |
1136 | |||
1137 | 4096 | shifted = _mm256_srli_si256(r_frame0, 4); | |
1138 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage3); | |
1139 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
1140 | |||
1141 | 4096 | shifted = _mm256_srli_si256(r_frame0, 2); | |
1142 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage2); | |
1143 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
1144 | |||
1145 | 4096 | shifted = _mm256_srli_si256(r_frame0, 1); | |
1146 | 4096 | shifted = _mm256_and_si256(shifted, mask_stage1); | |
1147 | 4096 | r_frame0 = _mm256_xor_si256(shifted, r_frame0); | |
1148 | |||
1149 | // store result of chunk. | ||
1150 | _mm256_store_si256((__m256i*)frame_ptr, r_frame0); | ||
1151 | 4096 | frame_ptr += 32; | |
1152 | } | ||
1153 | } | ||
1154 | #endif /* LV_HAVE_AVX2 */ | ||
1155 | |||
1156 | |||
1157 | #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ | ||
1158 | | | |
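
For reference, a minimal driver for the kernel covered above might look like the sketch below. It assumes a VOLK installation that exposes the generated dispatcher `volk_8u_x2_encodeframepolar_8u` together with the allocation helpers `volk_malloc`, `volk_get_alignment`, and `volk_free`; the chosen frame size and bit pattern are arbitrary illustration values.

```c
/* Minimal usage sketch (assumption: VOLK is installed and provides the
 * generated dispatcher volk_8u_x2_encodeframepolar_8u plus volk_malloc,
 * volk_get_alignment and volk_free). */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int frame_size = 32; /* frame size must be a power of two */
    const size_t alignment = volk_get_alignment();

    /* aligned buffers make the aligned (_a_) protokernels eligible */
    unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
    unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);

    for (unsigned int i = 0; i < frame_size; ++i)
        temp[i] = (unsigned char)(i & 1); /* arbitrary 0/1 input bits */

    /* the dispatcher selects the generic, SSSE3 or AVX2 protokernel at
     * runtime; temp holds the input bits, frame receives the encoded frame */
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    for (unsigned int i = 0; i < frame_size; ++i)
        printf("%u", frame[i]);
    printf("\n");

    volk_free(frame);
    volk_free(temp);
    return 0;
}
```

Note that, as the listing shows, each encoding stage copies the intermediate result from `frame` back into `temp` (`memcpy` after `encodepolar_single_stage`), so the input buffer is clobbered and cannot be reused after the call.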