Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_32fc_x2_conjugate_dot_prod_32fc | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * This kernel computes the conjugate dot product (also called the | ||
16 | * inner product) of the two vectors \p input and \p taps. Given | ||
17 | * \p num_points samples, the result is the sum of the element-wise | ||
18 | * products of the input vector with the complex conjugate of the | ||
19 | * taps vector. The result is a single complex float value stored | ||
20 | * at the \p result address. | ||
21 | * | ||
22 | * <b>Dispatcher Prototype</b> | ||
23 | * \code | ||
24 | * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, | ||
25 | * const lv_32fc_t* taps, unsigned int num_points) \endcode | ||
26 | * | ||
27 | * \b Inputs | ||
28 | * \li input: vector of complex floats. | ||
29 | * \li taps: complex float taps. | ||
30 | * \li num_points: number of samples in both \p input and \p taps. | ||
31 | * | ||
32 | * \b Outputs | ||
33 | * \li result: pointer to a complex float value to hold the dot product result. | ||
34 | * | ||
35 | * \b Example | ||
36 | * \code | ||
37 | * unsigned int N = 1000; | ||
38 | * unsigned int alignment = volk_get_alignment(); | ||
39 | * | ||
40 | * lv_32fc_t* a = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); | ||
41 | * lv_32fc_t* b = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); | ||
42 | * | ||
43 | * for (unsigned int i = 0; i < N; ++i) { | ||
44 | * a[i] = lv_cmake(.50f, .50f); | ||
45 | * b[i] = lv_cmake(.50f, .75f); | ||
46 | * } | ||
47 | * | ||
48 | * lv_32fc_t e = (float) N * a[0] * lv_conj(b[0]); // expected result for constant a and b | ||
49 | * lv_32fc_t res; | ||
50 | * | ||
51 | * volk_32fc_x2_conjugate_dot_prod_32fc(&res, a, b, N); | ||
52 | * | ||
53 | * printf("Expected: %8.2f%+8.2fi\n", lv_real(e), lv_imag(e)); | ||
54 | * printf("Result: %8.2f%+8.2fi\n", lv_real(res), lv_imag(res)); | ||
55 | * | ||
56 | * volk_free(a); | ||
57 | * volk_free(b); | ||
58 | * \endcode | ||
59 | */ | ||
60 | |||
61 | #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H | ||
62 | #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H | ||
63 | |||
64 | |||
65 | #include <volk/volk_complex.h> | ||
66 | |||
67 | |||
68 | #ifdef LV_HAVE_GENERIC | ||
69 | |||
70 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, | |
71 | const lv_32fc_t* input, | ||
72 | const lv_32fc_t* taps, | ||
73 | unsigned int num_points) | ||
74 | { | ||
75 | 2 | lv_32fc_t res = lv_cmake(0.f, 0.f); | |
76 | 2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times. | 262144 | for (unsigned int i = 0; i < num_points; ++i) { |
77 | 262142 | res += (*input++) * lv_conj((*taps++)); | |
78 | } | ||
79 | 2 | *result = res; | |
80 | 2 | } | |
81 | |||
82 | #endif /*LV_HAVE_GENERIC*/ | ||
83 | |||
84 | #ifdef LV_HAVE_GENERIC | ||
85 | |||
86 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_block(lv_32fc_t* result, | |
87 | const lv_32fc_t* input, | ||
88 | const lv_32fc_t* taps, | ||
89 | unsigned int num_points) | ||
90 | { | ||
91 | |||
92 | 2 | const unsigned int num_bytes = num_points * 8; | |
93 | |||
94 | 2 | float* res = (float*)result; | |
95 | 2 | float* in = (float*)input; | |
96 | 2 | float* tp = (float*)taps; | |
97 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
98 | |||
99 | 2 | float sum0[2] = { 0, 0 }; | |
100 | 2 | float sum1[2] = { 0, 0 }; | |
101 | 2 | unsigned int i = 0; | |
102 | |||
103 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
104 | 131070 | sum0[0] += in[0] * tp[0] + in[1] * tp[1]; | |
105 | 131070 | sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; | |
106 | 131070 | sum1[0] += in[2] * tp[2] + in[3] * tp[3]; | |
107 | 131070 | sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; | |
108 | |||
109 | 131070 | in += 4; | |
110 | 131070 | tp += 4; | |
111 | } | ||
112 | |||
113 | 2 | res[0] = sum0[0] + sum1[0]; | |
114 | 2 | res[1] = sum0[1] + sum1[1]; | |
115 | |||
116 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_bytes >> 3 & 1) { |
117 | 2 | *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); | |
118 | } | ||
119 | 2 | } | |
120 | |||
121 | #endif /*LV_HAVE_GENERIC*/ | ||
122 | |||
123 | #ifdef LV_HAVE_AVX | ||
124 | |||
125 | #include <immintrin.h> | ||
126 | |||
127 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
128 | const lv_32fc_t* input, | ||
129 | const lv_32fc_t* taps, | ||
130 | unsigned int num_points) | ||
131 | { | ||
132 | // Partial sums for indices i, i+1, i+2 and i+3. | ||
133 | 2 | __m256 sum_a_mult_b_real = _mm256_setzero_ps(); | |
134 | 2 | __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); | |
135 | |||
136 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { |
137 | /* Four complex elements are processed at a time. | ||
138 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
139 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
140 | */ | ||
141 | |||
142 | /* Load input and taps, split and duplicate real and imaginary parts of taps. | ||
143 | * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
144 | * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
145 | * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
146 | * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
147 | */ | ||
148 | 65534 | __m256 a = _mm256_loadu_ps((const float*)&input[i]); | |
149 | 131068 | __m256 b = _mm256_loadu_ps((const float*)&taps[i]); | |
150 | 65534 | __m256 b_real = _mm256_moveldup_ps(b); | |
151 | 65534 | __m256 b_imag = _mm256_movehdup_ps(b); | |
152 | |||
153 | // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
154 | 131068 | sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); | |
155 | // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
156 | 131068 | sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); | |
157 | } | ||
158 | |||
159 | // Swap position of −ar⋅bi and ai⋅bi. | ||
160 | 2 | sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
161 | // | ar⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. | ||
162 | 2 | __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
163 | /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. | ||
164 | * s1 + s3 and s0 + s2 … | ||
165 | */ | ||
166 | 2 | sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); | |
167 | // … and now (s0 + s2) + (s1 + s3) | ||
168 | 2 | sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
169 | // Store result. | ||
170 | 2 | __m128 lower = _mm256_extractf128_ps(sum, 0); | |
171 | _mm_storel_pi((__m64*)result, lower); | ||
172 | |||
173 | // Handle the last elements if num_points mod 4 is bigger than 0. | ||
174 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (long unsigned i = num_points & ~3u; i < num_points; ++i) { |
175 | 6 | *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + | |
176 | lv_cimag(input[i]) * lv_cimag(taps[i]), | ||
177 | lv_cimag(input[i]) * lv_creal(taps[i]) - | ||
178 | lv_creal(input[i]) * lv_cimag(taps[i])); | ||
179 | } | ||
180 | 2 | } | |
181 | |||
182 | #endif /* LV_HAVE_AVX */ | ||
183 | |||
184 | #ifdef LV_HAVE_SSE3 | ||
185 | |||
186 | #include <pmmintrin.h> | ||
187 | #include <xmmintrin.h> | ||
188 | |||
189 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, | |
190 | const lv_32fc_t* input, | ||
191 | const lv_32fc_t* taps, | ||
192 | unsigned int num_points) | ||
193 | { | ||
194 | // Partial sums for indices i and i+1. | ||
195 | 2 | __m128 sum_a_mult_b_real = _mm_setzero_ps(); | |
196 | 2 | __m128 sum_a_mult_b_imag = _mm_setzero_ps(); | |
197 | |||
198 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { |
199 | /* Two complex elements are processed at a time. | ||
200 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
201 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
202 | */ | ||
203 | |||
204 | /* Load input and taps, split and duplicate real and imaginary parts of taps. | ||
205 | * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
206 | * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
207 | * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
208 | * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
209 | */ | ||
210 | 131070 | __m128 a = _mm_loadu_ps((const float*)&input[i]); | |
211 | 262140 | __m128 b = _mm_loadu_ps((const float*)&taps[i]); | |
212 | 131070 | __m128 b_real = _mm_moveldup_ps(b); | |
213 | 131070 | __m128 b_imag = _mm_movehdup_ps(b); | |
214 | |||
215 | // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
216 | 262140 | sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); | |
217 | // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
218 | 262140 | sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); | |
219 | } | ||
220 | |||
221 | // Swap position of −ar⋅bi and ai⋅bi. | ||
222 | sum_a_mult_b_imag = | ||
223 | 2 | _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
224 | // | ar⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. | ||
225 | 2 | __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
226 | // Sum the two partial sums. | ||
227 | 4 | sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
228 | // Store result. | ||
229 | _mm_storel_pi((__m64*)result, sum); | ||
230 | |||
231 | // Handle the last element if num_points mod 2 is 1. | ||
232 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1u) { |
233 | 2 | *result += lv_cmake( | |
234 | lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + | ||
235 | lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), | ||
236 | lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - | ||
237 | lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); | ||
238 | } | ||
239 | 2 | } | |
240 | |||
241 | #endif /*LV_HAVE_SSE3*/ | ||
242 | |||
243 | #ifdef LV_HAVE_NEON | ||
244 | #include <arm_neon.h> | ||
245 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, | ||
246 | const lv_32fc_t* input, | ||
247 | const lv_32fc_t* taps, | ||
248 | unsigned int num_points) | ||
249 | { | ||
250 | |||
251 | unsigned int quarter_points = num_points / 4; | ||
252 | unsigned int number; | ||
253 | |||
254 | lv_32fc_t* a_ptr = (lv_32fc_t*)taps; | ||
255 | lv_32fc_t* b_ptr = (lv_32fc_t*)input; | ||
256 | // vld2q_f32 de-interleaves the complex samples: val[0] holds the | ||
257 | // real parts, val[1] holds the imaginary parts | ||
258 | float32x4x2_t a_val, b_val, accumulator; | ||
259 | float32x4x2_t tmp_imag; | ||
260 | accumulator.val[0] = vdupq_n_f32(0); | ||
261 | accumulator.val[1] = vdupq_n_f32(0); | ||
262 | |||
263 | for (number = 0; number < quarter_points; ++number) { | ||
264 | a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
265 | b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
266 | __VOLK_PREFETCH(a_ptr + 8); | ||
267 | __VOLK_PREFETCH(b_ptr + 8); | ||
268 | |||
269 | // do the first multiply | ||
270 | tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); | ||
271 | tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); | ||
272 | |||
273 | // use multiply accumulate/subtract to get result | ||
274 | tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); | ||
275 | tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); | ||
276 | |||
277 | accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); | ||
278 | accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); | ||
279 | |||
280 | // increment pointers | ||
281 | a_ptr += 4; | ||
282 | b_ptr += 4; | ||
283 | } | ||
284 | lv_32fc_t accum_result[4]; | ||
285 | vst2q_f32((float*)accum_result, accumulator); | ||
286 | *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
287 | |||
288 | // tail case | ||
289 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
290 | *result += (*a_ptr++) * lv_conj(*b_ptr++); | ||
291 | } | ||
292 | *result = lv_conj(*result); | ||
293 | } | ||
294 | #endif /*LV_HAVE_NEON*/ | ||
295 | |||
296 | #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ | ||
297 | |||
298 | #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H | ||
299 | #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H | ||
300 | |||
301 | #include <stdio.h> | ||
302 | #include <volk/volk_common.h> | ||
303 | #include <volk/volk_complex.h> | ||
304 | |||
305 | |||
306 | #ifdef LV_HAVE_AVX | ||
307 | #include <immintrin.h> | ||
308 | |||
309 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
310 | const lv_32fc_t* input, | ||
311 | const lv_32fc_t* taps, | ||
312 | unsigned int num_points) | ||
313 | { | ||
314 | // Partial sums for indices i, i+1, i+2 and i+3. | ||
315 | 2 | __m256 sum_a_mult_b_real = _mm256_setzero_ps(); | |
316 | 2 | __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); | |
317 | |||
318 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { |
319 | /* Four complex elements are processed at a time. | ||
320 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
321 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
322 | */ | ||
323 | |||
324 | /* Load input and taps, split and duplicate real and imaginary parts of taps. | ||
325 | * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
326 | * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
327 | * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
328 | * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
329 | */ | ||
330 | 65534 | __m256 a = _mm256_load_ps((const float*)&input[i]); | |
331 | 131068 | __m256 b = _mm256_load_ps((const float*)&taps[i]); | |
332 | 65534 | __m256 b_real = _mm256_moveldup_ps(b); | |
333 | 65534 | __m256 b_imag = _mm256_movehdup_ps(b); | |
334 | |||
335 | // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
336 | 131068 | sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); | |
337 | // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
338 | 131068 | sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); | |
339 | } | ||
340 | |||
341 | // Swap position of −ar⋅bi and ai⋅bi. | ||
342 | 2 | sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
343 | // | ar⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. | ||
344 | 2 | __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
345 | /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. | ||
346 | * s1 + s3 and s0 + s2 … | ||
347 | */ | ||
348 | 2 | sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); | |
349 | // … and now (s0 + s2) + (s1 + s3) | ||
350 | 2 | sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
351 | // Store result. | ||
352 | 2 | __m128 lower = _mm256_extractf128_ps(sum, 0); | |
353 | _mm_storel_pi((__m64*)result, lower); | ||
354 | |||
355 | // Handle the last elements if num_points mod 4 is bigger than 0. | ||
356 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (long unsigned i = num_points & ~3u; i < num_points; ++i) { |
357 | 6 | *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + | |
358 | lv_cimag(input[i]) * lv_cimag(taps[i]), | ||
359 | lv_cimag(input[i]) * lv_creal(taps[i]) - | ||
360 | lv_creal(input[i]) * lv_cimag(taps[i])); | ||
361 | } | ||
362 | 2 | } | |
363 | #endif /* LV_HAVE_AVX */ | ||
364 | |||
365 | #ifdef LV_HAVE_SSE3 | ||
366 | |||
367 | #include <pmmintrin.h> | ||
368 | #include <xmmintrin.h> | ||
369 | |||
370 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result, | |
371 | const lv_32fc_t* input, | ||
372 | const lv_32fc_t* taps, | ||
373 | unsigned int num_points) | ||
374 | { | ||
375 | // Partial sums for indices i and i+1. | ||
376 | 2 | __m128 sum_a_mult_b_real = _mm_setzero_ps(); | |
377 | 2 | __m128 sum_a_mult_b_imag = _mm_setzero_ps(); | |
378 | |||
379 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { |
380 | /* Two complex elements are processed at a time. | ||
381 | * (ar + j⋅ai)*conj(br + j⋅bi) = | ||
382 | * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) | ||
383 | */ | ||
384 | |||
385 | /* Load input and taps, split and duplicate real and imaginary parts of taps. | ||
386 | * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | | ||
387 | * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | | ||
388 | * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | | ||
389 | * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | | ||
390 | */ | ||
391 | 131070 | __m128 a = _mm_load_ps((const float*)&input[i]); | |
392 | 262140 | __m128 b = _mm_load_ps((const float*)&taps[i]); | |
393 | 131070 | __m128 b_real = _mm_moveldup_ps(b); | |
394 | 131070 | __m128 b_imag = _mm_movehdup_ps(b); | |
395 | |||
396 | // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. | ||
397 | 262140 | sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); | |
398 | // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. | ||
399 | 262140 | sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); | |
400 | } | ||
401 | |||
402 | // Swap position of −ar⋅bi and ai⋅bi. | ||
403 | sum_a_mult_b_imag = | ||
404 | 2 | _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); | |
405 | // | ar⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. | ||
406 | 2 | __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); | |
407 | // Sum the two partial sums. | ||
408 | 4 | sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); | |
409 | // Store result. | ||
410 | _mm_storel_pi((__m64*)result, sum); | ||
411 | |||
412 | // Handle the last element if num_points mod 2 is 1. | ||
413 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_points & 1u) { |
414 | 2 | *result += lv_cmake( | |
415 | lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + | ||
416 | lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), | ||
417 | lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - | ||
418 | lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); | ||
419 | } | ||
420 | 2 | } | |
421 | |||
422 | #endif /*LV_HAVE_SSE3*/ | ||
423 | |||
424 | |||
425 | #ifdef LV_HAVE_GENERIC | ||
426 | |||
427 | |||
428 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, | |
429 | const lv_32fc_t* input, | ||
430 | const lv_32fc_t* taps, | ||
431 | unsigned int num_points) | ||
432 | { | ||
433 | |||
434 | 2 | const unsigned int num_bytes = num_points * 8; | |
435 | |||
436 | 2 | float* res = (float*)result; | |
437 | 2 | float* in = (float*)input; | |
438 | 2 | float* tp = (float*)taps; | |
439 | 2 | unsigned int n_2_ccomplex_blocks = num_bytes >> 4; | |
440 | |||
441 | 2 | float sum0[2] = { 0, 0 }; | |
442 | 2 | float sum1[2] = { 0, 0 }; | |
443 | 2 | unsigned int i = 0; | |
444 | |||
445 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (i = 0; i < n_2_ccomplex_blocks; ++i) { |
446 | 131070 | sum0[0] += in[0] * tp[0] + in[1] * tp[1]; | |
447 | 131070 | sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; | |
448 | 131070 | sum1[0] += in[2] * tp[2] + in[3] * tp[3]; | |
449 | 131070 | sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; | |
450 | |||
451 | 131070 | in += 4; | |
452 | 131070 | tp += 4; | |
453 | } | ||
454 | |||
455 | 2 | res[0] = sum0[0] + sum1[0]; | |
456 | 2 | res[1] = sum0[1] + sum1[1]; | |
457 | |||
458 | 1/2 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. | 2 | if (num_bytes >> 3 & 1) { |
459 | 2 | *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); | |
460 | } | ||
461 | 2 | } | |
462 | |||
463 | #endif /*LV_HAVE_GENERIC*/ | ||
464 | |||
465 | |||
466 | #if LV_HAVE_SSE && LV_HAVE_64 | ||
467 | |||
468 | 2 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, | |
469 | const lv_32fc_t* input, | ||
470 | const lv_32fc_t* taps, | ||
471 | unsigned int num_points) | ||
472 | { | ||
473 | |||
474 | 2 | const unsigned int num_bytes = num_points * 8; | |
475 | |||
476 | __VOLK_ATTR_ALIGNED(16) | ||
477 | static const uint32_t conjugator[4] = { | ||
478 | 0x00000000, 0x80000000, 0x00000000, 0x80000000 | ||
479 | }; | ||
480 | |||
481 | 2 | __VOLK_ASM __VOLK_VOLATILE( | |
482 | "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" | ||
483 | "# const float *taps, unsigned num_bytes)\n\t" | ||
484 | "# float sum0 = 0;\n\t" | ||
485 | "# float sum1 = 0;\n\t" | ||
486 | "# float sum2 = 0;\n\t" | ||
487 | "# float sum3 = 0;\n\t" | ||
488 | "# do {\n\t" | ||
489 | "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" | ||
490 | "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" | ||
491 | "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" | ||
492 | "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" | ||
493 | "# input += 4;\n\t" | ||
494 | "# taps += 4; \n\t" | ||
495 | "# } while (--n_2_ccomplex_blocks != 0);\n\t" | ||
496 | "# result[0] = sum0 + sum2;\n\t" | ||
497 | "# result[1] = sum1 + sum3;\n\t" | ||
498 | "# TODO: prefetch and better scheduling\n\t" | ||
499 | " xor %%r9, %%r9\n\t" | ||
500 | " xor %%r10, %%r10\n\t" | ||
501 | " movq %[conjugator], %%r9\n\t" | ||
502 | " movq %%rcx, %%rax\n\t" | ||
503 | " movaps 0(%%r9), %%xmm8\n\t" | ||
504 | " movq %%rcx, %%r8\n\t" | ||
505 | " movq %[rsi], %%r9\n\t" | ||
506 | " movq %[rdx], %%r10\n\t" | ||
507 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
508 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
509 | " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" | ||
510 | " shr $4, %%r8\n\t" | ||
511 | " xorps %%xmm8, %%xmm2\n\t" | ||
512 | " jmp .%=L1_test\n\t" | ||
513 | " # 4 taps / loop\n\t" | ||
514 | " # something like ?? cycles / loop\n\t" | ||
515 | ".%=Loop1: \n\t" | ||
516 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
517 | "# movaps (%%r9), %%xmmA\n\t" | ||
518 | "# movaps (%%r10), %%xmmB\n\t" | ||
519 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
520 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
521 | "# mulps %%xmmB, %%xmmA\n\t" | ||
522 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
523 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
524 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
525 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
526 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
527 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
528 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
529 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
530 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
531 | "# addps %%xmmZ, %%xmmA\n\t" | ||
532 | "# addps %%xmmA, %%xmmC\n\t" | ||
533 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
534 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
535 | " movaps 0(%%r9), %%xmm0\n\t" | ||
536 | " movaps 16(%%r9), %%xmm1\n\t" | ||
537 | " movaps %%xmm0, %%xmm4\n\t" | ||
538 | " movaps 0(%%r10), %%xmm2\n\t" | ||
539 | " xorps %%xmm8, %%xmm2\n\t" | ||
540 | " mulps %%xmm2, %%xmm0\n\t" | ||
541 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
542 | " movaps 16(%%r10), %%xmm3\n\t" | ||
543 | " movaps %%xmm1, %%xmm5\n\t" | ||
544 | " xorps %%xmm8, %%xmm3\n\t" | ||
545 | " addps %%xmm0, %%xmm6\n\t" | ||
546 | " mulps %%xmm3, %%xmm1\n\t" | ||
547 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
548 | " addps %%xmm1, %%xmm6\n\t" | ||
549 | " mulps %%xmm4, %%xmm2\n\t" | ||
550 | " addps %%xmm2, %%xmm7\n\t" | ||
551 | " mulps %%xmm5, %%xmm3\n\t" | ||
552 | " add $32, %%r9\n\t" | ||
553 | " addps %%xmm3, %%xmm7\n\t" | ||
554 | " add $32, %%r10\n\t" | ||
555 | ".%=L1_test:\n\t" | ||
556 | " dec %%rax\n\t" | ||
557 | " jge .%=Loop1\n\t" | ||
558 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
559 | " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" | ||
560 | " # If so, we've got 2 more taps to do.\n\t" | ||
561 | " and $1, %%r8\n\t" | ||
562 | " je .%=Leven\n\t" | ||
563 | " # The count was odd, do 2 more taps.\n\t" | ||
564 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
565 | " # from the main loop.\n\t" | ||
566 | " movaps 0(%%r9), %%xmm0\n\t" | ||
567 | " movaps %%xmm0, %%xmm4\n\t" | ||
568 | " movaps 0(%%r10), %%xmm2\n\t" | ||
569 | " xorps %%xmm8, %%xmm2\n\t" | ||
570 | " mulps %%xmm2, %%xmm0\n\t" | ||
571 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
572 | " addps %%xmm0, %%xmm6\n\t" | ||
573 | " mulps %%xmm4, %%xmm2\n\t" | ||
574 | " addps %%xmm2, %%xmm7\n\t" | ||
575 | ".%=Leven:\n\t" | ||
576 | " # neg inversor\n\t" | ||
577 | " xorps %%xmm1, %%xmm1\n\t" | ||
578 | " mov $0x80000000, %%r9\n\t" | ||
579 | " movd %%r9, %%xmm1\n\t" | ||
580 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
581 | " # pfpnacc\n\t" | ||
582 | " xorps %%xmm1, %%xmm6\n\t" | ||
583 | " movaps %%xmm6, %%xmm2\n\t" | ||
584 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
585 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
586 | " movaps %%xmm2, %%xmm3\n\t" | ||
587 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
588 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
589 | " addps %%xmm2, %%xmm6\n\t" | ||
590 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
591 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
592 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
593 | " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " | ||
594 | "to memory\n\t" | ||
595 | : | ||
596 | : [rsi] "r"(input), | ||
597 | [rdx] "r"(taps), | ||
598 | "c"(num_bytes), | ||
599 | [rdi] "r"(result), | ||
600 | [conjugator] "r"(conjugator) | ||
601 | : "rax", "r8", "r9", "r10"); | ||
602 | |||
603 | 2 | int getem = num_bytes % 16; | |
604 | |||
605 | 2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times. | 4 | for (; getem > 0; getem -= 8) { |
606 | 2 | *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); | |
607 | } | ||
608 | 2 | } | |
609 | #endif | ||
610 | |||
611 | #if LV_HAVE_SSE && LV_HAVE_32 | ||
612 | static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, | ||
613 | const lv_32fc_t* input, | ||
614 | const lv_32fc_t* taps, | ||
615 | unsigned int num_points) | ||
616 | { | ||
617 | |||
618 | const unsigned int num_bytes = num_points * 8; | ||
619 | |||
620 | __VOLK_ATTR_ALIGNED(16) | ||
621 | static const uint32_t conjugator[4] = { | ||
622 | 0x00000000, 0x80000000, 0x00000000, 0x80000000 | ||
623 | }; | ||
624 | |||
625 | int bound = num_bytes >> 4; | ||
626 | int leftovers = num_bytes % 16; | ||
627 | |||
628 | __VOLK_ASM __VOLK_VOLATILE( | ||
629 | " #pushl %%ebp\n\t" | ||
630 | " #movl %%esp, %%ebp\n\t" | ||
631 | " #movl 12(%%ebp), %%eax # input\n\t" | ||
632 | " #movl 16(%%ebp), %%edx # taps\n\t" | ||
633 | " #movl 20(%%ebp), %%ecx # n_bytes\n\t" | ||
634 | " movaps 0(%[conjugator]), %%xmm1\n\t" | ||
635 | " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" | ||
636 | " movaps 0(%[eax]), %%xmm0\n\t" | ||
637 | " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" | ||
638 | " movaps 0(%[edx]), %%xmm2\n\t" | ||
639 | " movl %[ecx], (%[out])\n\t" | ||
640 | " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" | ||
641 | |||
642 | " xorps %%xmm1, %%xmm2\n\t" | ||
643 | " jmp .%=L1_test\n\t" | ||
644 | " # 4 taps / loop\n\t" | ||
645 | " # something like ?? cycles / loop\n\t" | ||
646 | ".%=Loop1: \n\t" | ||
647 | "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" | ||
648 | "# movaps (%[eax]), %%xmmA\n\t" | ||
649 | "# movaps (%[edx]), %%xmmB\n\t" | ||
650 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
651 | "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" | ||
652 | "# mulps %%xmmB, %%xmmA\n\t" | ||
653 | "# mulps %%xmmZ, %%xmmB\n\t" | ||
654 | "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" | ||
655 | "# xorps %%xmmPN, %%xmmA\n\t" | ||
656 | "# movaps %%xmmA, %%xmmZ\n\t" | ||
657 | "# unpcklps %%xmmB, %%xmmA\n\t" | ||
658 | "# unpckhps %%xmmB, %%xmmZ\n\t" | ||
659 | "# movaps %%xmmZ, %%xmmY\n\t" | ||
660 | "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" | ||
661 | "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" | ||
662 | "# addps %%xmmZ, %%xmmA\n\t" | ||
663 | "# addps %%xmmA, %%xmmC\n\t" | ||
664 | "# A=xmm0, B=xmm2, Z=xmm4\n\t" | ||
665 | "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" | ||
666 | " movaps 16(%[edx]), %%xmm3\n\t" | ||
667 | " movaps %%xmm0, %%xmm4\n\t" | ||
668 | " xorps %%xmm1, %%xmm3\n\t" | ||
669 | " mulps %%xmm2, %%xmm0\n\t" | ||
670 | " movaps 16(%[eax]), %%xmm1\n\t" | ||
671 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
672 | " movaps %%xmm1, %%xmm5\n\t" | ||
673 | " addps %%xmm0, %%xmm6\n\t" | ||
674 | " mulps %%xmm3, %%xmm1\n\t" | ||
675 | " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" | ||
676 | " addps %%xmm1, %%xmm6\n\t" | ||
677 | " movaps 0(%[conjugator]), %%xmm1\n\t" | ||
678 | " mulps %%xmm4, %%xmm2\n\t" | ||
679 | " movaps 32(%[eax]), %%xmm0\n\t" | ||
680 | " addps %%xmm2, %%xmm7\n\t" | ||
681 | " mulps %%xmm5, %%xmm3\n\t" | ||
682 | " addl $32, %[eax]\n\t" | ||
683 | " movaps 32(%[edx]), %%xmm2\n\t" | ||
684 | " addps %%xmm3, %%xmm7\n\t" | ||
685 | " xorps %%xmm1, %%xmm2\n\t" | ||
686 | " addl $32, %[edx]\n\t" | ||
687 | ".%=L1_test:\n\t" | ||
688 | " decl %[ecx]\n\t" | ||
689 | " jge .%=Loop1\n\t" | ||
690 | " # We've handled the bulk of multiplies up to here.\n\t" | ||
691 | " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" | ||
692 | " # If so, we've got 2 more taps to do.\n\t" | ||
693 | " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" | ||
694 | " shrl $4, %[ecx]\n\t" | ||
695 | " andl $1, %[ecx]\n\t" | ||
696 | " je .%=Leven\n\t" | ||
697 | " # The count was odd, do 2 more taps.\n\t" | ||
698 | " # Note that we've already got mm0/mm2 preloaded\n\t" | ||
699 | " # from the main loop.\n\t" | ||
700 | " movaps %%xmm0, %%xmm4\n\t" | ||
701 | " mulps %%xmm2, %%xmm0\n\t" | ||
702 | " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" | ||
703 | " addps %%xmm0, %%xmm6\n\t" | ||
704 | " mulps %%xmm4, %%xmm2\n\t" | ||
705 | " addps %%xmm2, %%xmm7\n\t" | ||
706 | ".%=Leven:\n\t" | ||
707 | " # neg inversor\n\t" | ||
708 | " #movl 8(%%ebp), %[eax] \n\t" | ||
709 | " xorps %%xmm1, %%xmm1\n\t" | ||
710 | " movl $0x80000000, (%[out])\n\t" | ||
711 | " movss (%[out]), %%xmm1\n\t" | ||
712 | " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" | ||
713 | " # pfpnacc\n\t" | ||
714 | " xorps %%xmm1, %%xmm6\n\t" | ||
715 | " movaps %%xmm6, %%xmm2\n\t" | ||
716 | " unpcklps %%xmm7, %%xmm6\n\t" | ||
717 | " unpckhps %%xmm7, %%xmm2\n\t" | ||
718 | " movaps %%xmm2, %%xmm3\n\t" | ||
719 | " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" | ||
720 | " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" | ||
721 | " addps %%xmm2, %%xmm6\n\t" | ||
722 | " # xmm6 = r1 i2 r3 i4\n\t" | ||
723 | " #movl 8(%%ebp), %[eax] # @result\n\t" | ||
724 | " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" | ||
725 | " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" | ||
726 | " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " | ||
727 | "to memory\n\t" | ||
728 | " #popl %%ebp\n\t" | ||
729 | : | ||
730 | : [eax] "r"(input), | ||
731 | [edx] "r"(taps), | ||
732 | [ecx] "r"(num_bytes), | ||
733 | [out] "r"(result), | ||
734 | [conjugator] "r"(conjugator)); | ||
735 | |||
736 | for (; leftovers > 0; leftovers -= 8) { | ||
737 | *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); | ||
738 | } | ||
739 | } | ||
740 | #endif /*LV_HAVE_SSE*/ | ||
741 | |||
742 | |||
743 | #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ | ||
744 |
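
All of the SIMD protokernels in the listing split the conjugate product into the same pair of real expressions that appear in their comments. Written out once, the identity they implement is

```latex
(a_r + j a_i)\,\overline{(b_r + j b_i)} = (a_r b_r + a_i b_i) + j\,(a_i b_r - a_r b_i)
```

Below is a minimal sketch of how a result from the dispatcher can be cross-checked against a plain scalar loop, which is one way to exercise the branches counted above. It is not part of the kernel file: `N`, the test data, and the tolerance are arbitrary choices, and it assumes a normal VOLK installation providing `<volk/volk.h>`, `volk_malloc()`, and the dispatcher `volk_32fc_x2_conjugate_dot_prod_32fc()`.

```c
/* Sketch only: cross-check the dispatcher against a scalar reference sum.
 * lv_cmake(), lv_conj(), lv_creal() and lv_cimag() come from
 * <volk/volk_complex.h>, which <volk/volk.h> pulls in. */
#include <math.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 1000;                 /* arbitrary test length */
    const size_t alignment = volk_get_alignment();

    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t res = lv_cmake(0.f, 0.f);
    lv_32fc_t ref = lv_cmake(0.f, 0.f);

    for (unsigned int i = 0; i < N; ++i) {
        in[i] = lv_cmake(0.001f * (float)i, 1.0f - 0.001f * (float)i);
        taps[i] = lv_cmake(0.5f, -0.25f);
        ref += in[i] * lv_conj(taps[i]);         /* scalar reference sum */
    }

    volk_32fc_x2_conjugate_dot_prod_32fc(&res, in, taps, N);

    /* SIMD kernels reorder the additions, so allow a small absolute tolerance
     * (the value here is an arbitrary choice for this test). */
    const float tol = 1e-1f;
    const int ok = fabsf(lv_creal(res) - lv_creal(ref)) < tol &&
                   fabsf(lv_cimag(res) - lv_cimag(ref)) < tol;
    printf("reference:  %f%+fi\ndispatcher: %f%+fi\n%s\n",
           lv_creal(ref), lv_cimag(ref),
           lv_creal(res), lv_cimag(res),
           ok ? "match within tolerance" : "MISMATCH");

    volk_free(in);
    volk_free(taps);
    return ok ? 0 : 1;
}
```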