/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*!
 * \page volk_16i_x4_quad_max_star_16i
 *
 * \b Overview
 *
 * Selects the maximum of the four corresponding 16-bit input values for each
 * element and writes it to the target vector. Each pairwise comparison is
 * made on the sign of the wrapped 16-bit difference, so operands that differ
 * by more than 32767 wrap around rather than saturate.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1,
 *                                    short* src2, short* src3, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li src0: The input vector 0.
 * \li src1: The input vector 1.
 * \li src2: The input vector 2.
 * \li src3: The input vector 3.
 * \li num_points: The number of data points in each vector.
 *
 * \b Outputs
 * \li target: The output vector of per-element maxima.
 *
 * \b Example
 * \code
 *   int N = 10000;
 *   unsigned int alignment = volk_get_alignment();
 *   short* target = (short*)volk_malloc(sizeof(short) * N, alignment);
 *   short* src[4];
 *   for (int n = 0; n < 4; ++n) {
 *       src[n] = (short*)volk_malloc(sizeof(short) * N, alignment);
 *       // ... fill src[n] with data ...
 *   }
 *
 *   volk_16i_x4_quad_max_star_16i(target, src[0], src[1], src[2], src[3], N);
 *
 *   volk_free(target);
 *   for (int n = 0; n < 4; ++n)
 *       volk_free(src[n]);
 * \endcode
 */

#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);         // full 8-element (16-byte) chunks
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; // elements left for the scalar tail

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
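
    /* Branchless per-lane select, applied twice per iteration: the mask from
     * _mm_cmpgt_epi16 is all-ones in each 16-bit lane where (b - a) > 0, so
     * (mask & b) + (~mask & a) keeps the larger value of each pair. The first
     * round reduces src0/src1 and src2/src3 to two candidates; the second
     * round picks the overall winner. */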
    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1;
        xmm8 = xmm3;

        xmm1 = _mm_sub_epi16(xmm2, xmm1);
        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5);
        xmm6 = _mm_add_epi16(xmm4, xmm6);

        xmm1 = _mm_xor_si128(xmm1, xmm1);
        xmm2 = xmm5;
        xmm5 = _mm_sub_epi16(xmm6, xmm5);
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1);
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }

    /* Disabled legacy inline-assembly version of the loop above, kept for
     * reference: */
    /*__VOLK_ASM __VOLK_VOLATILE
    (
    "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
    "cmp $0, %[bound]\n\t"
    "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"

    "movaps (%[src0]), %%xmm1\n\t"
    "movaps (%[src1]), %%xmm2\n\t"
    "movaps (%[src2]), %%xmm3\n\t"
    "movaps (%[src3]), %%xmm4\n\t"

    "pxor %%xmm5, %%xmm5\n\t"
    "pxor %%xmm6, %%xmm6\n\t"
    "movaps %%xmm1, %%xmm7\n\t"
    "movaps %%xmm3, %%xmm8\n\t"
    "psubw %%xmm2, %%xmm1\n\t"
    "psubw %%xmm4, %%xmm3\n\t"

    "pcmpgtw %%xmm1, %%xmm5\n\t"
    "pcmpgtw %%xmm3, %%xmm6\n\t"

    "pand %%xmm5, %%xmm2\n\t"
    "pand %%xmm6, %%xmm4\n\t"
    "pandn %%xmm7, %%xmm5\n\t"
    "pandn %%xmm8, %%xmm6\n\t"

    "paddw %%xmm2, %%xmm5\n\t"
    "paddw %%xmm4, %%xmm6\n\t"

    "pxor %%xmm1, %%xmm1\n\t"
    "movaps %%xmm5, %%xmm2\n\t"

    "psubw %%xmm6, %%xmm5\n\t"
    "add $16, %[src0]\n\t"
    "add $-1, %[bound]\n\t"

    "pcmpgtw %%xmm5, %%xmm1\n\t"
    "add $16, %[src1]\n\t"

    "pand %%xmm1, %%xmm6\n\t"

    "pandn %%xmm2, %%xmm1\n\t"
    "add $16, %[src2]\n\t"

    "paddw %%xmm6, %%xmm1\n\t"
    "add $16, %[src3]\n\t"

    "movaps %%xmm1, (%[target])\n\t"
    "addw $16, %[target]\n\t"
    "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"

    "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
    :
    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
    [src3]"r"(src3), [target]"r"(target)
    :
    );
    */

    // Scalar tail: handle the 0-7 elements the SIMD loop did not cover.
    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}

#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned i;
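
    /* Same pairwise selection as the SSE2 path, but built from two
     * complementary masks per comparison: (diff >= 0) keeps the first
     * operand, (diff < 0) keeps the second, and adding the two masked
     * values yields the larger element of each 16-bit lane. */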
    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        result1_vec = vaddq_s16(comp0, comp1);
        result2_vec = vaddq_s16(comp2, comp3);

        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC
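
/* Portable reference implementation; VOLK's dispatcher falls back to this
 * when no SIMD proto-kernel is available. Note the comparisons are taken on
 * the wrapped 16-bit difference, matching the SIMD kernels above. */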
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1;

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/