GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16i_x4_quad_max_star_16i.h
Date: 2023-10-23 23:10:04
            Exec  Total  Coverage
Lines:         0     61      0.0%
Functions:     0      2      0.0%
Branches:      0     18      0.0%

/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*!
 * \page volk_16i_x4_quad_max_star_16i
 *
 * \b Overview
 *
 * Selects, element by element, the largest value among four input vectors of
 * 16-bit shorts. Each input pair (src0, src1) and (src2, src3) is reduced by
 * testing the sign of the 16-bit difference of its elements, and the two
 * winners are compared the same way to produce the output element. Because
 * the test uses the wrapped 16-bit difference, the result can differ from a
 * true maximum when the difference overflows.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1,
 *                                    short* src2, short* src3, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li src0: The input vector 0.
 * \li src1: The input vector 1.
 * \li src2: The input vector 2.
 * \li src3: The input vector 3.
 * \li num_points: The number of data points.
 *
 * \b Outputs
 * \li target: The output vector.
 *
 * \b Example
 * \code
 * int N = 10000;
 * unsigned int alignment = volk_get_alignment();
 * short* src0 = (short*)volk_malloc(sizeof(short) * N, alignment);
 * short* src1 = (short*)volk_malloc(sizeof(short) * N, alignment);
 * short* src2 = (short*)volk_malloc(sizeof(short) * N, alignment);
 * short* src3 = (short*)volk_malloc(sizeof(short) * N, alignment);
 * short* target = (short*)volk_malloc(sizeof(short) * N, alignment);
 * // ... fill the four source vectors, then:
 * volk_16i_x4_quad_max_star_16i(target, src0, src1, src2, src3, N);
 * volk_free(src0);
 * volk_free(src1);
 * volk_free(src2);
 * volk_free(src3);
 * volk_free(target);
 * \endcode
 */

#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>
#include <stdio.h>
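
/* Illustrative scalar model of the quad max* operation (a sketch inferred
 * from the generic implementation below; this helper is hypothetical and not
 * part of the VOLK API). Each step selects by the sign of a 16-bit
 * difference, which matches a plain max() only when the subtraction does not
 * wrap around. */
static inline short
volk_16i_x4_quad_max_star_16i_model(short a, short b, short c, short d)
{
    short t0 = ((short)(a - b) > 0) ? a : b; /* winner of the first pair */
    short t1 = ((short)(c - d) > 0) ? c : d; /* winner of the second pair */
    return ((short)(t0 - t1) > 0) ? t0 : t1; /* winner of the winners */
}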

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7;

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

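    /* The loop below applies a branchless per-lane select three times:
     * src0 vs src1, src2 vs src3, then winner vs winner. _mm_cmpgt_epi16
     * yields an all-ones lane mask where the difference is positive; AND
     * keeps one candidate, ANDNOT the other, and ADD merges the two
     * disjoint halves. */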
    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1;
        xmm8 = xmm3;

        xmm1 = _mm_sub_epi16(xmm2, xmm1);

        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5);
        xmm6 = _mm_add_epi16(xmm4, xmm6);

        xmm1 = _mm_xor_si128(xmm1, xmm1);
        xmm2 = xmm5;
        xmm5 = _mm_sub_epi16(xmm6, xmm5);
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);

        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1);
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }


    /*__VOLK_ASM __VOLK_VOLATILE
    (
        "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
        "cmp $0, %[bound]\n\t"
        "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"

        "movaps (%[src0]), %%xmm1\n\t"
        "movaps (%[src1]), %%xmm2\n\t"
        "movaps (%[src2]), %%xmm3\n\t"
        "movaps (%[src3]), %%xmm4\n\t"

        "pxor %%xmm5, %%xmm5\n\t"
        "pxor %%xmm6, %%xmm6\n\t"
        "movaps %%xmm1, %%xmm7\n\t"
        "movaps %%xmm3, %%xmm8\n\t"
        "psubw %%xmm2, %%xmm1\n\t"
        "psubw %%xmm4, %%xmm3\n\t"

        "pcmpgtw %%xmm1, %%xmm5\n\t"
        "pcmpgtw %%xmm3, %%xmm6\n\t"

        "pand %%xmm5, %%xmm2\n\t"
        "pand %%xmm6, %%xmm4\n\t"
        "pandn %%xmm7, %%xmm5\n\t"
        "pandn %%xmm8, %%xmm6\n\t"

        "paddw %%xmm2, %%xmm5\n\t"
        "paddw %%xmm4, %%xmm6\n\t"

        "pxor %%xmm1, %%xmm1\n\t"
        "movaps %%xmm5, %%xmm2\n\t"

        "psubw %%xmm6, %%xmm5\n\t"
        "add $16, %[src0]\n\t"
        "add $-1, %[bound]\n\t"

        "pcmpgtw %%xmm5, %%xmm1\n\t"
        "add $16, %[src1]\n\t"

        "pand %%xmm1, %%xmm6\n\t"

        "pandn %%xmm2, %%xmm1\n\t"
        "add $16, %[src2]\n\t"

        "paddw %%xmm6, %%xmm1\n\t"
        "add $16, %[src3]\n\t"

        "movaps %%xmm1, (%[target])\n\t"
        "addw $16, %[target]\n\t"
        "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"

        "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
        :
        : [bound] "r"(bound), [src0] "r"(src0), [src1] "r"(src1), [src2] "r"(src2),
          [src3] "r"(src3), [target] "r"(target)
        :
    );
    */

    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}

#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
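    /* Same branchless select as the SSE2 path: vcgeq/vcltq produce
     * complementary all-ones lane masks, AND isolates each candidate, and
     * the add merges the two disjoint halves. Ties (difference == 0) take
     * the first operand via the vcgeq mask. */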
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        result1_vec = vaddq_s16(comp0, comp1);
        result2_vec = vaddq_s16(comp2, comp3);

        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; /* num_bytes >> 1 == num_points */

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
