GCC Code Coverage Report


Directory: ./
File: kernels/volk/volk_16i_x5_add_quad_16i_x4.h
Date: 2023-10-23 23:10:04
Exec Total Coverage
Lines: 0 49 0.0%
Functions: 0 2 0.0%
Branches: 0 6 0.0%

Line Branch Exec Source
1 /* -*- c++ -*- */
2 /*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10 /*!
11 * \page volk_16i_x5_add_quad_16i_x4
12 *
13 * \b Overview
14 *
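15 * Adds the vector src0 element-wise to each of src1, src2, src3 and src4, producing four output vectors of 16-bit sums.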
16 *
17 * <b>Dispatcher Prototype</b>
18 * \code
19 * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short*
20 * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int
21 * num_points); \endcode
22 *
23 * \b Inputs
24 * \li src0: The input vector 0.
25 * \li src1: The input vector 1.
26 * \li src2: The input vector 2.
27 * \li src3: The input vector 3.
28 * \li src4: The input vector 4.
29 * \li num_points: The number of data points.
30 *
31 * \b Outputs
32 * \li target0: The output vector 0 (src0 + src1).
33 * \li target1: The output vector 1 (src0 + src2).
34 * \li target2: The output vector 2 (src0 + src3).
35 * \li target3: The output vector 3 (src0 + src4).
36 *
37 * \b Example
38 * \code
39 * int N = 10000;
40 *
41 * volk_16i_x5_add_quad_16i_x4();
42 *
43 * volk_free(x);
44 * \endcode
45 */
46
47 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
48 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
49
50 #include <inttypes.h>
51 #include <stdio.h>
52
53 #ifdef LV_HAVE_SSE2
54 #include <emmintrin.h>
55 #include <xmmintrin.h>
56
/*!
 * \brief SSE2 kernel: adds src0 element-wise to src1, src2, src3 and src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * Groups of eight 16-bit values are processed with _mm_load_si128 /
 * _mm_store_si128, so all nine buffers must be 16-byte aligned whenever
 * num_points >= 8. The remaining (num_points % 8) values are handled by a
 * scalar loop.
 *
 * \param target0 Output buffer receiving src0 + src1.
 * \param target1 Output buffer receiving src0 + src2.
 * \param target2 Output buffer receiving src0 + src3.
 * \param target3 Output buffer receiving src0 + src4.
 * \param src0 Input added to each of the other four inputs.
 * \param src1 Input vector 1.
 * \param src2 Input vector 2.
 * \param src3 Input vector 3.
 * \param src4 Input vector 4.
 * \param num_points Number of 16-bit values in every buffer.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    /* Derive the block count directly from num_points. Going through a byte
     * count (num_points * 2) wraps for num_points >= 2^31 and would silently
     * skip part of the input. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int number;

    __m128i* p_target0 = (__m128i*)target0;
    __m128i* p_target1 = (__m128i*)target1;
    __m128i* p_target2 = (__m128i*)target2;
    __m128i* p_target3 = (__m128i*)target3;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    const __m128i* p_src4 = (const __m128i*)src4;

    for (number = 0; number < eighth_points; ++number) {
        /* Load all five inputs before any store so that the result for a
         * block does not depend on the order in which outputs are written. */
        const __m128i base = _mm_load_si128(p_src0++);
        const __m128i in1 = _mm_load_si128(p_src1++);
        const __m128i in2 = _mm_load_si128(p_src2++);
        const __m128i in3 = _mm_load_si128(p_src3++);
        const __m128i in4 = _mm_load_si128(p_src4++);

        /* Lane-wise 16-bit additions. */
        const __m128i sum0 = _mm_add_epi16(base, in1);
        const __m128i sum1 = _mm_add_epi16(base, in2);
        const __m128i sum2 = _mm_add_epi16(base, in3);
        const __m128i sum3 = _mm_add_epi16(base, in4);

        _mm_store_si128(p_target0++, sum0);
        _mm_store_si128(p_target1++, sum1);
        _mm_store_si128(p_target2++, sum2);
        _mm_store_si128(p_target3++, sum3);
    }

    /* Scalar tail: the last num_points % 8 values. */
    for (number = eighth_points * 8; number < num_points; ++number) {
        target0[number] = (short)(src0[number] + src1[number]);
        target1[number] = (short)(src0[number] + src2[number]);
        target2[number] = (short)(src0[number] + src3[number]);
        target3[number] = (short)(src0[number] + src4[number]);
    }
}
164 #endif /*LV_HAVE_SSE2*/
165
166 #ifdef LV_HAVE_NEON
167 #include <arm_neon.h>
168
/*!
 * \brief NEON kernel: adds src0 element-wise to src1, src2, src3 and src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * Eight 16-bit values are processed per vector iteration; the remaining
 * (num_points % 8) values are handled by a scalar loop.
 *
 * \param target0 Output buffer receiving src0 + src1.
 * \param target1 Output buffer receiving src0 + src2.
 * \param target2 Output buffer receiving src0 + src3.
 * \param target3 Output buffer receiving src0 + src4.
 * \param src0 Input added to each of the other four inputs.
 * \param src1 Input vector 1.
 * \param src2 Input vector 2.
 * \param src3 Input vector 3.
 * \param src4 Input vector 4.
 * \param num_points Number of 16-bit values in every buffer.
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    /* Index of the first element that does not fit in a full 8-lane block. */
    const unsigned int vector_end = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += 8) {
        /* All loads happen before any store within a block. */
        const int16x8_t base = vld1q_s16(src0 + idx);
        const int16x8_t in1 = vld1q_s16(src1 + idx);
        const int16x8_t in2 = vld1q_s16(src2 + idx);
        const int16x8_t in3 = vld1q_s16(src3 + idx);
        const int16x8_t in4 = vld1q_s16(src4 + idx);

        const int16x8_t sum0 = vaddq_s16(base, in1);
        const int16x8_t sum1 = vaddq_s16(base, in2);
        const int16x8_t sum2 = vaddq_s16(base, in3);
        const int16x8_t sum3 = vaddq_s16(base, in4);

        vst1q_s16(target0 + idx, sum0);
        vst1q_s16(target1 + idx, sum1);
        vst1q_s16(target2 + idx, sum2);
        vst1q_s16(target3 + idx, sum3);
    }

    /* Scalar tail, continuing from where the vector loop stopped. */
    for (; idx < num_points; ++idx) {
        target0[idx] = src0[idx] + src1[idx];
        target1[idx] = src0[idx] + src2[idx];
        target2[idx] = src0[idx] + src3[idx];
        target3[idx] = src0[idx] + src4[idx];
    }
}
219
220 #endif /* LV_HAVE_NEON */
221
222 #ifdef LV_HAVE_GENERIC
223
/*!
 * \brief Portable kernel: adds src0 element-wise to src1, src2, src3 and src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * \param target0 Output buffer receiving src0 + src1.
 * \param target1 Output buffer receiving src0 + src2.
 * \param target2 Output buffer receiving src0 + src3.
 * \param target3 Output buffer receiving src0 + src4.
 * \param src0 Input added to each of the other four inputs.
 * \param src1 Input vector 1.
 * \param src2 Input vector 2.
 * \param src3 Input vector 3.
 * \param src4 Input vector 4.
 * \param num_points Number of 16-bit values in every buffer.
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    unsigned int i;

    /* Iterate straight to num_points with an unsigned index. Converting to a
     * byte count and back ((num_points * 2) >> 1, stored in an int) drops the
     * top bit for num_points >= 2^31 and would process too few elements. */
    for (i = 0; i < num_points; ++i) {
        /* Operands are promoted to int for the addition; the sum is then
         * narrowed back to short on assignment. */
        target0[i] = (short)(src0[i] + src1[i]);
        target1[i] = (short)(src0[i] + src2[i]);
        target2[i] = (short)(src0[i] + src3[i]);
        target3[i] = (short)(src0[i] + src4[i]);
    }
}
248
249 #endif /* LV_HAVE_GENERIC */
250
251 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
252