| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2019 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32fc_accumulator_s32fc | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * Accumulates the values in the input buffer. | ||
| 16 | * | ||
| 17 | * <b>Dispatcher Prototype</b> | ||
| 18 | * \code | ||
| 19 | * void volk_32fc_accumulator_s32fc(lv_32fc_t* result, const lv_32fc_t* inputBuffer, | ||
| 20 | * unsigned int num_points) \endcode | ||
| 21 | * | ||
| 22 | * \b Inputs | ||
| 23 | * \li inputBuffer: The buffer of data to be accumulated | ||
| 24 | * \li num_points: The number of data points. | ||
| 25 | * | ||
| 26 | * \b Outputs | ||
| 27 | * \li result: The accumulated result. | ||
| 28 | * | ||
| 29 | * \b Example | ||
| 30 | * Calculate the sum of numbers 0 through 99 | ||
| 31 | * \code | ||
| 32 | * int N = 100; | ||
| 33 | * unsigned int alignment = volk_get_alignment(); | ||
| 34 | * lv_32fc_t* vec = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t)*N, alignment); | ||
| 35 | * lv_32fc_t* out = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t), alignment); | ||
| 36 | * | ||
| 37 | * for(unsigned int ii = 0; ii < N; ++ii){ | ||
| 38 | * vec[ii] = lv_cmake( (float) ii, -(float) ii ); | ||
| 39 | * } | ||
| 40 | * | ||
| 41 | * volk_32fc_accumulator_s32fc(out, vec, N); | ||
| 42 | * | ||
| 43 | * printf("sum(0..99)+1j*sum(0..-99) = %1.2f %1.2f \n", lv_creal(*out) , lv_cimag(*out) | ||
| 44 | * ); | ||
| 45 | * | ||
| 46 | * volk_free(vec); | ||
| 47 | * volk_free(out); | ||
| 48 | * \endcode | ||
| 49 | */ | ||
| 50 | |||
| 51 | #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H | ||
| 52 | #define INCLUDED_volk_32fc_accumulator_s32fc_a_H | ||
| 53 | |||
| 54 | #include <inttypes.h> | ||
| 55 | #include <volk/volk_common.h> | ||
| 56 | |||
| 57 | #ifdef LV_HAVE_GENERIC | ||
| 58 | 2 | static inline void volk_32fc_accumulator_s32fc_generic(lv_32fc_t* result, | |
| 59 | const lv_32fc_t* inputBuffer, | ||
| 60 | unsigned int num_points) | ||
| 61 | { | ||
| 62 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
| 63 | 2 | unsigned int number = 0; | |
| 64 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
| 65 | |||
| 66 | 2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times. | 262144 | for (; number < num_points; number++) { |
| 67 | 262142 | returnValue += (*aPtr++); | |
| 68 | } | ||
| 69 | 2 | *result = returnValue; | |
| 70 | 2 | } | |
| 71 | #endif /* LV_HAVE_GENERIC */ | ||
| 72 | |||
| 73 | #ifdef LV_HAVE_AVX | ||
| 74 | #include <immintrin.h> | ||
| 75 | |||
| 76 | 2 | static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result, | |
| 77 | const lv_32fc_t* inputBuffer, | ||
| 78 | unsigned int num_points) | ||
| 79 | { | ||
| 80 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
| 81 | 2 | unsigned int number = 0; | |
| 82 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 83 | |||
| 84 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
| 85 | __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; | ||
| 86 | |||
| 87 | 2 | __m256 accumulator = _mm256_setzero_ps(); | |
| 88 | 2 | __m256 aVal = _mm256_setzero_ps(); | |
| 89 | |||
| 90 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 91 | 65534 | aVal = _mm256_loadu_ps((float*)aPtr); | |
| 92 | 65534 | accumulator = _mm256_add_ps(accumulator, aVal); | |
| 93 | 65534 | aPtr += 4; | |
| 94 | } | ||
| 95 | |||
| 96 | _mm256_store_ps(tempBuffer, accumulator); | ||
| 97 | |||
| 98 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
| 99 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
| 100 | 2 | returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]); | |
| 101 | 2 | returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]); | |
| 102 | |||
| 103 | 2 | number = quarterPoints * 4; | |
| 104 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (; number < num_points; number++) { |
| 105 | 6 | returnValue += (*aPtr++); | |
| 106 | } | ||
| 107 | 2 | *result = returnValue; | |
| 108 | 2 | } | |
| 109 | #endif /* LV_HAVE_AVX */ | ||
| 110 | |||
| 111 | #ifdef LV_HAVE_SSE | ||
| 112 | #include <xmmintrin.h> | ||
| 113 | |||
| 114 | 2 | static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result, | |
| 115 | const lv_32fc_t* inputBuffer, | ||
| 116 | unsigned int num_points) | ||
| 117 | { | ||
| 118 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
| 119 | 2 | unsigned int number = 0; | |
| 120 | 2 | const unsigned int halfPoints = num_points / 2; | |
| 121 | |||
| 122 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
| 123 | __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; | ||
| 124 | |||
| 125 | 2 | __m128 accumulator = _mm_setzero_ps(); | |
| 126 | 2 | __m128 aVal = _mm_setzero_ps(); | |
| 127 | |||
| 128 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
| 129 | 131070 | aVal = _mm_loadu_ps((float*)aPtr); | |
| 130 | 131070 | accumulator = _mm_add_ps(accumulator, aVal); | |
| 131 | 131070 | aPtr += 2; | |
| 132 | } | ||
| 133 | |||
| 134 | _mm_store_ps(tempBuffer, accumulator); | ||
| 135 | |||
| 136 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
| 137 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
| 138 | |||
| 139 | 2 | number = halfPoints * 2; | |
| 140 | 2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times. | 4 | for (; number < num_points; number++) { |
| 141 | 2 | returnValue += (*aPtr++); | |
| 142 | } | ||
| 143 | 2 | *result = returnValue; | |
| 144 | 2 | } | |
| 145 | #endif /* LV_HAVE_SSE */ | ||
| 146 | |||
| 147 | #ifdef LV_HAVE_AVX | ||
| 148 | #include <immintrin.h> | ||
| 149 | |||
| 150 | 2 | static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result, | |
| 151 | const lv_32fc_t* inputBuffer, | ||
| 152 | unsigned int num_points) | ||
| 153 | { | ||
| 154 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
| 155 | 2 | unsigned int number = 0; | |
| 156 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 157 | |||
| 158 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
| 159 | __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; | ||
| 160 | |||
| 161 | 2 | __m256 accumulator = _mm256_setzero_ps(); | |
| 162 | 2 | __m256 aVal = _mm256_setzero_ps(); | |
| 163 | |||
| 164 | 2/2 ✓ Branch 0 taken 65534 times. ✓ Branch 1 taken 2 times. | 65536 | for (; number < quarterPoints; number++) { |
| 165 | 65534 | aVal = _mm256_load_ps((float*)aPtr); | |
| 166 | 65534 | accumulator = _mm256_add_ps(accumulator, aVal); | |
| 167 | 65534 | aPtr += 4; | |
| 168 | } | ||
| 169 | |||
| 170 | _mm256_store_ps(tempBuffer, accumulator); | ||
| 171 | |||
| 172 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
| 173 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
| 174 | 2 | returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]); | |
| 175 | 2 | returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]); | |
| 176 | |||
| 177 | 2 | number = quarterPoints * 4; | |
| 178 | 2/2 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 2 times. | 8 | for (; number < num_points; number++) { |
| 179 | 6 | returnValue += (*aPtr++); | |
| 180 | } | ||
| 181 | 2 | *result = returnValue; | |
| 182 | 2 | } | |
| 183 | #endif /* LV_HAVE_AVX */ | ||
| 184 | |||
| 185 | #ifdef LV_HAVE_SSE | ||
| 186 | #include <xmmintrin.h> | ||
| 187 | |||
| 188 | 2 | static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result, | |
| 189 | const lv_32fc_t* inputBuffer, | ||
| 190 | unsigned int num_points) | ||
| 191 | { | ||
| 192 | 2 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | |
| 193 | 2 | unsigned int number = 0; | |
| 194 | 2 | const unsigned int halfPoints = num_points / 2; | |
| 195 | |||
| 196 | 2 | const lv_32fc_t* aPtr = inputBuffer; | |
| 197 | __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; | ||
| 198 | |||
| 199 | 2 | __m128 accumulator = _mm_setzero_ps(); | |
| 200 | 2 | __m128 aVal = _mm_setzero_ps(); | |
| 201 | |||
| 202 | 2/2 ✓ Branch 0 taken 131070 times. ✓ Branch 1 taken 2 times. | 131072 | for (; number < halfPoints; number++) { |
| 203 | 131070 | aVal = _mm_load_ps((float*)aPtr); | |
| 204 | 131070 | accumulator = _mm_add_ps(accumulator, aVal); | |
| 205 | 131070 | aPtr += 2; | |
| 206 | } | ||
| 207 | |||
| 208 | _mm_store_ps(tempBuffer, accumulator); | ||
| 209 | |||
| 210 | 2 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | |
| 211 | 2 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | |
| 212 | |||
| 213 | 2 | number = halfPoints * 2; | |
| 214 | 2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times. | 4 | for (; number < num_points; number++) { |
| 215 | 2 | returnValue += (*aPtr++); | |
| 216 | } | ||
| 217 | 2 | *result = returnValue; | |
| 218 | 2 | } | |
| 219 | #endif /* LV_HAVE_SSE */ | ||
| 220 | |||
| 221 | #ifdef LV_HAVE_NEON | ||
| 222 | #include <arm_neon.h> | ||
| 223 | static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result, | ||
| 224 | const lv_32fc_t* inputBuffer, | ||
| 225 | unsigned int num_points) | ||
| 226 | { | ||
| 227 | const lv_32fc_t* aPtr = inputBuffer; | ||
| 228 | unsigned int number = 0; | ||
| 229 | lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | ||
| 230 | unsigned int eighthPoints = num_points / 8; | ||
| 231 | float32x4_t in_vec; | ||
| 232 | float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f }; | ||
| 233 | float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f }; | ||
| 234 | float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f }; | ||
| 235 | float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f }; | ||
| 236 | __VOLK_ATTR_ALIGNED(32) float tempBuffer[4]; | ||
| 237 | |||
| 238 | for (; number < eighthPoints; number++) { | ||
| 239 | in_vec = vld1q_f32((float*)aPtr); | ||
| 240 | out_vec0 = vaddq_f32(in_vec, out_vec0); | ||
| 241 | aPtr += 2; | ||
| 242 | |||
| 243 | in_vec = vld1q_f32((float*)aPtr); | ||
| 244 | out_vec1 = vaddq_f32(in_vec, out_vec1); | ||
| 245 | aPtr += 2; | ||
| 246 | |||
| 247 | in_vec = vld1q_f32((float*)aPtr); | ||
| 248 | out_vec2 = vaddq_f32(in_vec, out_vec2); | ||
| 249 | aPtr += 2; | ||
| 250 | |||
| 251 | in_vec = vld1q_f32((float*)aPtr); | ||
| 252 | out_vec3 = vaddq_f32(in_vec, out_vec3); | ||
| 253 | aPtr += 2; | ||
| 254 | } | ||
| 255 | vst1q_f32(tempBuffer, out_vec0); | ||
| 256 | returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]); | ||
| 257 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | ||
| 258 | |||
| 259 | vst1q_f32(tempBuffer, out_vec1); | ||
| 260 | returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]); | ||
| 261 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | ||
| 262 | |||
| 263 | vst1q_f32(tempBuffer, out_vec2); | ||
| 264 | returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]); | ||
| 265 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | ||
| 266 | |||
| 267 | vst1q_f32(tempBuffer, out_vec3); | ||
| 268 | returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]); | ||
| 269 | returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]); | ||
| 270 | |||
| 271 | number = eighthPoints * 8; | ||
| 272 | for (; number < num_points; number++) { | ||
| 273 | returnValue += (*aPtr++); | ||
| 274 | } | ||
| 275 | *result = returnValue; | ||
| 276 | } | ||
| 277 | #endif /* LV_HAVE_NEON */ | ||
| 278 | |||
| 279 | #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */ | ||
| 280 |