GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16i_max_star_horizontal_16i.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 0 52 0.0%
Functions: 0 2 0.0%
Branches: 0 12 0.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11 /*!
12 * \page volk_16i_max_star_horizontal_16i
13 *
14 * \b Overview
15 *
16 * Computes the maximum of each pair of adjacent entries of the input vector
16 * (the horizontal max* operation), producing an output of half the length.
17 *
18 * <b>Dispatcher Prototype</b>
19 * \code
20 * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int
21 * num_points); \endcode
22 *
23 * \b Inputs
24 * \li src0: The input vector of 16-bit shorts.
25 * \li num_points: The number of data points in the input vector.
26 *
27 * \b Outputs
28 * \li target: The output value of the max* operation.
29 *
30 * \b Example
31 * \code
32 * int N = 10000;
33 * int16_t* x = (int16_t*)volk_malloc(sizeof(int16_t) * N, volk_get_alignment());
34 * int16_t* t = (int16_t*)volk_malloc(sizeof(int16_t) * (N / 2), volk_get_alignment());
35 * volk_16i_max_star_horizontal_16i(t, x, N);
36 * volk_free(x);
37 * volk_free(t);
38 * \endcode
39 */
40
41 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
42 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
43
44 #include <volk/volk_common.h>
45
46 #include <inttypes.h>
47 #include <stdio.h>
48
49
50 #ifdef LV_HAVE_SSSE3
51
52 #include <emmintrin.h>
53 #include <tmmintrin.h>
54 #include <xmmintrin.h>
55
56 static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
57 int16_t* src0,
58 unsigned int num_points)
59 {
60 const unsigned int num_bytes = num_points * 2;
61
62 static const uint8_t shufmask0[16] = {
63 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
64 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
65 };
66 static const uint8_t shufmask1[16] = {
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
69 };
70 static const uint8_t andmask0[16] = {
71 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
72 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
73 };
74 static const uint8_t andmask1[16] = {
75 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
77 };
78
79 __m128i xmm0 = {}, xmm1 = {}, xmm2 = {}, xmm3 = {}, xmm4 = {};
80 __m128i xmm5 = {}, xmm6 = {}, xmm7 = {}, xmm8 = {};
81
82 xmm4 = _mm_load_si128((__m128i*)shufmask0);
83 xmm5 = _mm_load_si128((__m128i*)shufmask1);
84 xmm6 = _mm_load_si128((__m128i*)andmask0);
85 xmm7 = _mm_load_si128((__m128i*)andmask1);
86
87 __m128i *p_target, *p_src0;
88
89 p_target = (__m128i*)target;
90 p_src0 = (__m128i*)src0;
91
92 int bound = num_bytes >> 5;
93 int intermediate = (num_bytes >> 4) & 1;
94 int leftovers = (num_bytes >> 1) & 7;
95
96 int i = 0;
97
98 for (i = 0; i < bound; ++i) {
99 xmm0 = _mm_load_si128(p_src0);
100 xmm1 = _mm_load_si128(&p_src0[1]);
101
102 xmm2 = _mm_xor_si128(xmm2, xmm2);
103 p_src0 += 2;
104
105 xmm3 = _mm_hsub_epi16(xmm0, xmm1);
106
107 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
108
109 xmm8 = _mm_and_si128(xmm2, xmm6);
110 xmm3 = _mm_and_si128(xmm2, xmm7);
111
112
113 xmm8 = _mm_add_epi8(xmm8, xmm4);
114 xmm3 = _mm_add_epi8(xmm3, xmm5);
115
116 xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
117 xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
118
119
120 xmm3 = _mm_add_epi16(xmm0, xmm1);
121
122
123 _mm_store_si128(p_target, xmm3);
124
125 p_target += 1;
126 }
127
128 if (intermediate) {
129 xmm0 = _mm_load_si128(p_src0);
130
131 xmm2 = _mm_xor_si128(xmm2, xmm2);
132 p_src0 += 1;
133
134 xmm3 = _mm_hsub_epi16(xmm0, xmm1);
135 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
136
137 xmm8 = _mm_and_si128(xmm2, xmm6);
138
139 xmm3 = _mm_add_epi8(xmm8, xmm4);
140
141 xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
142
143 _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
144
145 p_target = (__m128i*)((int8_t*)p_target + 8);
146 }
147
148 for (i = (bound << 4) + (intermediate << 3);
149 i < (bound << 4) + (intermediate << 3) + leftovers;
150 i += 2) {
151 target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
152 }
153 }
154
155 #endif /*LV_HAVE_SSSE3*/
156
157 #ifdef LV_HAVE_NEON
158
159 #include <arm_neon.h>
160 static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
161 int16_t* src0,
162 unsigned int num_points)
163 {
164 const unsigned int eighth_points = num_points / 16;
165 unsigned number;
166 int16x8x2_t input_vec;
167 int16x8_t diff, max_vec, zeros;
168 uint16x8_t comp1, comp2;
169 zeros = vdupq_n_s16(0);
170 for (number = 0; number < eighth_points; ++number) {
171 input_vec = vld2q_s16(src0);
172 //__VOLK_PREFETCH(src0+16);
173 diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
174 comp1 = vcgeq_s16(diff, zeros);
175 comp2 = vcltq_s16(diff, zeros);
176
177 input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
178 input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
179
180 max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
181 vst1q_s16(target, max_vec);
182 src0 += 16;
183 target += 8;
184 }
185 for (number = 0; number < num_points % 16; number += 2) {
186 target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
187 ? src0[number]
188 : src0[number + 1];
189 }
190 }
191 #endif /* LV_HAVE_NEON */
192
193 #ifdef LV_HAVE_NEONV7
194 extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
195 int16_t* src0,
196 unsigned int num_points);
197 #endif /* LV_HAVE_NEONV7 */
198
199 #ifdef LV_HAVE_GENERIC
200 static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
201 int16_t* src0,
202 unsigned int num_points)
203 {
204 const unsigned int num_bytes = num_points * 2;
205
206 int i = 0;
207
208 int bound = num_bytes >> 1;
209
210 for (i = 0; i < bound; i += 2) {
211 target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
212 }
213 }
214
215 #endif /*LV_HAVE_GENERIC*/
216
217 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
218