Line |
Branch |
Exec |
Source |
1 |
|
|
/* -*- c++ -*- */ |
2 |
|
|
/* |
3 |
|
|
* Copyright 2012, 2014 Free Software Foundation, Inc. |
4 |
|
|
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com> |
5 |
|
|
* |
6 |
|
|
* This file is part of VOLK |
7 |
|
|
* |
8 |
|
|
* SPDX-License-Identifier: LGPL-3.0-or-later |
9 |
|
|
*/ |
10 |
|
|
|
11 |
|
|
/*! |
12 |
|
|
* \page volk_16i_max_star_horizontal_16i |
13 |
|
|
* |
14 |
|
|
* \b Overview |
15 |
|
|
* |
16 |
|
|
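* For each adjacent pair of input values (src0[2k], src0[2k+1]), writes src0[2k] to target[k] when the 16-bit difference src0[2k] - src0[2k+1] is positive, and src0[2k+1] otherwise. |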
17 |
|
|
* |
18 |
|
|
* <b>Dispatcher Prototype</b> |
19 |
|
|
* \code |
20 |
|
|
* void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int |
21 |
|
|
* num_points); \endcode |
22 |
|
|
* |
23 |
|
|
* \b Inputs |
24 |
|
|
* \li src0: The input vector. |
25 |
|
|
* \li num_points: The number of 16-bit values in src0; they are consumed in adjacent pairs, so one output is written per two inputs. |
26 |
|
|
* |
27 |
|
|
* \b Outputs |
28 |
|
|
* \li target: The output value of the max* operation. |
29 |
|
|
* |
30 |
|
|
* \b Example |
31 |
|
|
* \code |
32 |
|
|
* int N = 10000; |
33 |
|
|
* |
34 |
|
|
* volk_16i_max_star_horizontal_16i(); |
35 |
|
|
* |
36 |
|
|
* volk_free(x); |
37 |
|
|
* volk_free(t); |
38 |
|
|
* \endcode |
39 |
|
|
*/ |
40 |
|
|
|
41 |
|
|
#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H |
42 |
|
|
#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H |
43 |
|
|
|
44 |
|
|
#include <volk/volk_common.h> |
45 |
|
|
|
46 |
|
|
#include <inttypes.h> |
47 |
|
|
#include <stdio.h> |
48 |
|
|
|
49 |
|
|
|
50 |
|
|
#ifdef LV_HAVE_SSSE3 |
51 |
|
|
|
52 |
|
|
#include <emmintrin.h> |
53 |
|
|
#include <tmmintrin.h> |
54 |
|
|
#include <xmmintrin.h> |
55 |
|
|
|
56 |
|
✗ |
/*!
 * \brief SSSE3 kernel: pairwise selection over adjacent 16-bit values.
 *
 * For every pair (src0[2k], src0[2k+1]) the 16-bit wrapping difference
 * src0[2k] - src0[2k+1] is formed; target[k] receives src0[2k+1] when that
 * difference is negative and src0[2k] otherwise.
 *
 * \param target     Output buffer, one value per input pair. Accessed with
 *                   aligned 128-bit stores, so it must be 16-byte aligned.
 * \param src0       Input buffer. Accessed with aligned 128-bit loads, so it
 *                   must be 16-byte aligned.
 * \param num_points Number of 16-bit input values in src0.
 *
 * \note When num_points is odd the scalar tail still processes a final pair
 *       and therefore reads src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    /* Byte-shuffle patterns that gather the FIRST element of each 16-bit pair
     * into the low (mask0) or high (mask1) half of the result; 0xff lanes are
     * zeroed by _mm_shuffle_epi8. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09,
                                           0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0x00, 0x01, 0x04, 0x05,
                                           0x08, 0x09, 0x0c, 0x0d };
    /* Adding 2 to a shuffle index moves it from the first to the second
     * element of the pair. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x02, 0x02 };

    /* The tables are plain uint8_t arrays with no declared 16-byte alignment,
     * so they are read with unaligned loads. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i bump_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i bump_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    __m128i* p_target = (__m128i*)target;
    const __m128i* p_src0 = (const __m128i*)src0;

    const int bound = num_bytes >> 5;              /* full 32-byte input chunks */
    const int intermediate = (num_bytes >> 4) & 1; /* one extra 16-byte chunk   */
    const int leftovers = (num_bytes >> 1) & 7;    /* remaining 16-bit inputs   */

    int i;

    /* 16 inputs -> 8 outputs per iteration. */
    for (i = 0; i < bound; ++i) {
        __m128i in0 = _mm_load_si128(p_src0);
        __m128i in1 = _mm_load_si128(p_src0 + 1);
        p_src0 += 2;

        /* Per-pair difference (first - second); all-ones where it is negative. */
        const __m128i diff = _mm_hsub_epi16(in0, in1);
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);

        /* Where the difference is negative, redirect the shuffle to the
         * second element of the pair. */
        const __m128i sel_lo = _mm_add_epi8(_mm_and_si128(is_neg, bump_lo), shuf_lo);
        const __m128i sel_hi = _mm_add_epi8(_mm_and_si128(is_neg, bump_hi), shuf_hi);

        in0 = _mm_shuffle_epi8(in0, sel_lo);
        in1 = _mm_shuffle_epi8(in1, sel_hi);

        /* The two halves occupy disjoint lanes, so the add merges them. */
        _mm_store_si128(p_target, _mm_add_epi16(in0, in1));
        p_target += 1;
    }

    /* 8 inputs -> 4 outputs; only the low 64 bits of the result are used. */
    if (intermediate) {
        const __m128i in0 = _mm_load_si128(p_src0);

        const __m128i diff = _mm_hsub_epi16(in0, in0);
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);
        const __m128i sel_lo = _mm_add_epi8(_mm_and_si128(is_neg, bump_lo), shuf_lo);

        _mm_storel_epi64(p_target, _mm_shuffle_epi8(in0, sel_lo));
    }

    /* Scalar tail over the remaining inputs, indexed from the buffer start. */
    const int tail_begin = (bound << 4) + (intermediate << 3);
    for (i = tail_begin; i < tail_begin + leftovers; i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
154 |
|
|
|
155 |
|
|
#endif /*LV_HAVE_SSSE3*/ |
156 |
|
|
|
157 |
|
|
#ifdef LV_HAVE_NEON |
158 |
|
|
|
159 |
|
|
#include <arm_neon.h> |
160 |
|
|
/*!
 * \brief NEON kernel: pairwise selection over adjacent 16-bit values.
 *
 * For every pair (src0[2k], src0[2k+1]) the 16-bit wrapping difference
 * src0[2k] - src0[2k+1] is formed; target[k] receives src0[2k] when the
 * difference is non-negative and src0[2k+1] otherwise.
 *
 * \param target     Output buffer, one value per input pair.
 * \param src0       Input buffer of 16-bit values.
 * \param num_points Number of 16-bit input values in src0.
 *
 * \note When num_points is odd the scalar tail still processes a final pair
 *       and therefore reads one value past the last counted input.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    const unsigned int vector_iters = num_points / 16; /* 16 inputs per pass */
    const unsigned int tail_values = num_points % 16;
    const int16x8_t zero = vdupq_n_s16(0);
    unsigned int n;

    for (n = vector_iters; n != 0; --n) {
        /* De-interleaving load: val[0] holds the first element of each pair,
         * val[1] the second. */
        const int16x8x2_t pairs = vld2q_s16(src0);
        const int16x8_t diff = vsubq_s16(pairs.val[0], pairs.val[1]);

        /* All-ones lanes where the first element should be kept. */
        const uint16x8_t take_first = vcgeq_s16(diff, zero);

        vst1q_s16(target, vbslq_s16(take_first, pairs.val[0], pairs.val[1]));

        src0 += 16;
        target += 8;
    }

    /* Scalar tail, relative to the already advanced pointers. */
    for (n = 0; n < tail_values; n += 2) {
        const int16_t first = src0[n];
        const int16_t second = src0[n + 1];
        target[n >> 1] = ((int16_t)(first - second) > 0) ? first : second;
    }
}
191 |
|
|
#endif /* LV_HAVE_NEON */ |
192 |
|
|
|
193 |
|
|
#ifdef LV_HAVE_NEONV7 |
194 |
|
|
extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, |
195 |
|
|
int16_t* src0, |
196 |
|
|
unsigned int num_points); |
197 |
|
|
#endif /* LV_HAVE_NEONV7 */ |
198 |
|
|
|
199 |
|
|
#ifdef LV_HAVE_GENERIC |
200 |
|
✗ |
/*!
 * \brief Portable kernel: pairwise selection over adjacent 16-bit values.
 *
 * For every pair (src0[2k], src0[2k+1]) the difference is truncated to
 * 16 bits; target[k] receives src0[2k] when that value is positive and
 * src0[2k+1] otherwise.
 *
 * \param target     Output buffer, one value per input pair.
 * \param src0       Input buffer of 16-bit values.
 * \param num_points Number of 16-bit input values in src0.
 *
 * \note When num_points is odd the last iteration still forms a pair and
 *       therefore reads src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;
    const int num_inputs = num_bytes >> 1;

    int16_t* out = target;
    int idx = 0;

    while (idx < num_inputs) {
        const int16_t first = src0[idx];
        const int16_t second = src0[idx + 1];
        const int16_t diff = (int16_t)(first - second);

        if (diff > 0) {
            *out = first;
        } else {
            *out = second;
        }

        ++out;
        idx += 2;
    }
}
214 |
|
|
|
215 |
|
|
#endif /*LV_HAVE_GENERIC*/ |
216 |
|
|
|
217 |
|
|
#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/ |
218 |
|
|
|