Line |
Branch |
Exec |
Source |
1 |
|
|
/* -*- c++ -*- */ |
2 |
|
|
/* |
3 |
|
|
* Copyright 2012, 2014 Free Software Foundation, Inc. |
4 |
|
|
* |
5 |
|
|
* This file is part of VOLK |
6 |
|
|
* |
7 |
|
|
* SPDX-License-Identifier: LGPL-3.0-or-later |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
/*! |
11 |
|
|
* \page volk_16i_max_star_16i |
12 |
|
|
* |
13 |
|
|
* \b Overview |
14 |
|
|
* |
15 |
|
|
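* Reduces the input vector to a single 16-bit value: starting from src0[0], each element replaces the running candidate unless the difference (candidate - element), truncated to 16 bits, is positive. |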
16 |
|
|
* |
17 |
|
|
* <b>Dispatcher Prototype</b> |
18 |
|
|
* \code |
19 |
|
|
* void volk_16i_max_star_16i(short* target, short* src0, unsigned int num_points); |
20 |
|
|
* \endcode |
21 |
|
|
* |
22 |
|
|
* \b Inputs |
23 |
|
|
* \li src0: The input vector. |
24 |
|
|
* \li num_points: The number of 16-bit integer points in src0. |
25 |
|
|
* |
26 |
|
|
* \b Outputs |
27 |
|
|
* \li target: The output value of the max* operation. |
28 |
|
|
* |
29 |
|
|
* \b Example |
30 |
|
|
* \code |
31 |
|
|
* int N = 10000; |
32 |
|
|
* |
33 |
|
|
* volk_16i_max_star_16i(t, x, N); |
34 |
|
|
* |
35 |
|
|
* volk_free(x); |
36 |
|
|
* volk_free(t); |
37 |
|
|
* \endcode |
38 |
|
|
*/ |
39 |
|
|
|
40 |
|
|
#ifndef INCLUDED_volk_16i_max_star_16i_a_H |
41 |
|
|
#define INCLUDED_volk_16i_max_star_16i_a_H |
42 |
|
|
|
43 |
|
|
#include <inttypes.h> |
44 |
|
|
#include <stdio.h> |
45 |
|
|
|
46 |
|
|
#ifdef LV_HAVE_SSSE3 |
47 |
|
|
|
48 |
|
|
#include <emmintrin.h> |
49 |
|
|
#include <tmmintrin.h> |
50 |
|
|
#include <xmmintrin.h> |
51 |
|
|
|
52 |
|
|
/*!
 * \brief Aligned SSE implementation of the 16-bit max* reduction.
 *
 * Keeps a running candidate and replaces it with the next input unless the
 * difference (candidate - input), truncated to 16 bits, is positive. Eight
 * lanes are reduced in parallel, then folded together, then the remaining
 * (num_points % 8) inputs are folded in one at a time.
 *
 * \param target     Receives the single result in target[0]. Left untouched
 *                   when num_points is 0.
 * \param src0       Input vector. Must be 16-byte aligned because the vector
 *                   loop uses _mm_load_si128.
 * \param num_points Number of 16-bit values in src0.
 *
 * \note The lane-wise reduction visits the inputs in a different order than a
 *       sequential scan. The two agree whenever no pairwise difference wraps
 *       in 16 bits (all inputs within a span narrower than 32768).
 */
static inline void
volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
{
    /* Work in elements, not bytes: num_points * 2 would wrap for large counts. */
    const unsigned int num_blocks = num_points / 8;
    const unsigned int leftovers = num_points % 8;
    const __m128i* p_src0 = (const __m128i*)src0;
    const short* tail = src0 + 8 * num_blocks;

    short cands[8];
    short candidate;
    __m128i best;
    __m128i zero;
    unsigned int i;

    /* Nothing to reduce; also avoids reading src0[0] from an empty vector. */
    if (num_points == 0) {
        return;
    }

    candidate = src0[0];
    zero = _mm_setzero_si128();

    /* Seed every lane with the first input so that no value that is absent
     * from the data (such as 0) can win the reduction. */
    best = _mm_set1_epi16(candidate);

    for (i = 0; i < num_blocks; ++i) {
        const __m128i in = _mm_load_si128(p_src0 + i);
        /* Same test as the scalar code: keep the lane's candidate only when
         * the wrapped 16-bit difference is strictly positive. */
        const __m128i diff = _mm_sub_epi16(best, in);
        const __m128i keep = _mm_cmpgt_epi16(diff, zero);

        best = _mm_or_si128(_mm_and_si128(keep, best), _mm_andnot_si128(keep, in));
    }

    /* cands has no guaranteed 16-byte alignment, so use an unaligned store. */
    _mm_storeu_si128((__m128i*)cands, best);

    /* Fold the eight per-lane winners into one value. */
    for (i = 0; i < 8; ++i) {
        candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
    }

    /* Fold in the inputs that did not fill a whole vector. */
    for (i = 0; i < leftovers; ++i) {
        candidate = ((short)(candidate - tail[i]) > 0) ? candidate : tail[i];
    }

    target[0] = candidate;
}
107 |
|
|
|
108 |
|
|
#endif /*LV_HAVE_SSSE3*/ |
109 |
|
|
|
110 |
|
|
#ifdef LV_HAVE_NEON |
111 |
|
|
#include <arm_neon.h> |
112 |
|
|
|
113 |
|
|
/*!
 * \brief NEON implementation of the 16-bit max* reduction.
 *
 * Keeps a running candidate and replaces it with the next input unless the
 * difference (candidate - input), truncated to 16 bits, is positive. Eight
 * lanes are reduced in parallel, then folded together, then the remaining
 * (num_points % 8) inputs are folded in one at a time.
 *
 * \param target     Receives the single result in target[0]. Left untouched
 *                   when num_points is 0.
 * \param src0       Input vector.
 * \param num_points Number of 16-bit values in src0.
 *
 * \note The lane-wise reduction visits the inputs in a different order than a
 *       sequential scan. The two agree whenever no pairwise difference wraps
 *       in 16 bits (all inputs within a span narrower than 32768).
 */
static inline void
volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const unsigned int leftovers = num_points % 8;
    const int16x8_t zeros = vdupq_n_s16(0);
    int16x8_t candidate_vec;
    int16_t lanes[8];
    short candidate;
    unsigned int number;

    /* Nothing to reduce; also avoids reading src0[0] from an empty vector. */
    if (num_points == 0) {
        return;
    }

    /* Seed every lane with the first input. src0 is NOT advanced past it:
     * skipping ahead would shift every later load by one element and make the
     * loops read one element beyond the end of the vector. */
    candidate = src0[0];
    candidate_vec = vdupq_n_s16(candidate);

    for (number = 0; number < eighth_points; ++number) {
        const int16x8_t input_vec = vld1q_s16(src0);
        /* Wrapped 16-bit difference; keep the lane's candidate only when it is
         * strictly positive, exactly like the scalar comparison below. */
        const int16x8_t diff = vsubq_s16(candidate_vec, input_vec);
        const uint16x8_t keep = vcgtq_s16(diff, zeros);

        __VOLK_PREFETCH(src0 + 16);
        candidate_vec = vbslq_s16(keep, candidate_vec, input_vec);
        src0 += 8;
    }

    /* Spill all eight lanes into a buffer that is large enough to hold them,
     * then fold the per-lane winners into one value. */
    vst1q_s16(lanes, candidate_vec);
    for (number = 0; number < 8; ++number) {
        candidate = ((int16_t)(candidate - lanes[number]) > 0) ? candidate : lanes[number];
    }

    /* Fold in the inputs that did not fill a whole vector. */
    for (number = 0; number < leftovers; ++number) {
        candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
    }

    target[0] = candidate;
}
149 |
|
|
#endif /*LV_HAVE_NEON*/ |
150 |
|
|
|
151 |
|
|
#ifdef LV_HAVE_GENERIC |
152 |
|
|
|
153 |
|
|
/*!
 * \brief Portable C implementation of the 16-bit max* reduction.
 *
 * Scans src0 from front to back. The running candidate starts as src0[0] and
 * is replaced by the next input unless the difference (candidate - input),
 * truncated to 16 bits, is positive.
 *
 * \param target     Receives the single result in target[0]. Left untouched
 *                   when num_points is 0.
 * \param src0       Input vector.
 * \param num_points Number of 16-bit values in src0.
 */
static inline void
volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
{
    unsigned int i;
    short candidate;

    /* Nothing to reduce; also avoids reading src0[0] from an empty vector. */
    if (num_points == 0) {
        return;
    }

    candidate = src0[0];

    /* Iterate over elements directly. Converting to a byte count and back
     * (num_points * 2, then >> 1) would drop the top bit of large counts. */
    for (i = 1; i < num_points; ++i) {
        candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
    }

    target[0] = candidate;
}
168 |
|
|
|
169 |
|
|
#endif /*LV_HAVE_GENERIC*/ |
170 |
|
|
|
171 |
|
|
|
172 |
|
|
#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/ |
173 |
|
|
|