| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32fc_32f_dot_prod_32fc | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * This block computes the dot product (or inner product) between two | ||
| 16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
| 17 | * num_points taps, the result is the sum of products between the two | ||
| 18 | * vectors. The result is a single value stored in the \p result | ||
| 19 | * address and will be complex. | ||
| 20 | * | ||
| 21 | * <b>Dispatcher Prototype</b> | ||
| 22 | * \code | ||
| 23 | * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float | ||
| 24 | * * taps, unsigned int num_points) \endcode | ||
| 25 | * | ||
| 26 | * \b Inputs | ||
| 27 | * \li input: vector of complex samples | ||
| 28 | * \li taps: floating point taps | ||
| 29 | * \li num_points: number of samples in both \p input and \p taps | ||
| 30 | * | ||
| 31 | * \b Outputs | ||
| 32 | * \li result: pointer to a complex value to hold the dot product result. | ||
| 33 | * | ||
| 34 | * \b Example | ||
| 35 | * \code | ||
| 36 | * int N = 10000; | ||
| 37 | * lv_32fc_t y; | ||
| 38 | * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment()); | ||
| 39 | * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment()); | ||
| 40 | * | ||
| 41 | * <populate x and t with some values> | ||
| 42 | * | ||
| 43 | * volk_32fc_32f_dot_prod_32fc(&y, x, t, N); | ||
| 44 | * | ||
| 45 | * volk_free(x); | ||
| 46 | * volk_free(t); | ||
| 47 | * \endcode | ||
| 48 | */ | ||
| 49 | |||
| 50 | #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H | ||
| 51 | #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H | ||
| 52 | |||
| 53 | #include <stdio.h> | ||
| 54 | #include <volk/volk_common.h> | ||
| 55 | |||
| 56 | #ifdef LV_HAVE_GENERIC | ||
| 57 | |||
| 58 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, | |
| 59 | const lv_32fc_t* input, | ||
| 60 | const float* taps, | ||
| 61 | unsigned int num_points) | ||
| 62 | { | ||
| 63 | |||
| 64 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 65 | 2 | const float* aPtr = (float*)input; | |
| 66 | 2 | const float* bPtr = taps; | |
| 67 | 2 | unsigned int number = 0; | |
| 68 | |||
| 69 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
| 70 | 262142 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 71 | 262142 | aPtr += 2; | |
| 72 | 262142 | bPtr += 1; | |
| 73 | } | ||
| 74 | |||
| 75 | 2 | *result = returnValue; | |
| 76 | 2 | } | |
| 77 | |||
| 78 | #endif /*LV_HAVE_GENERIC*/ | ||
| 79 | |||
| 80 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 81 | |||
| 82 | #include <immintrin.h> | ||
| 83 | |||
| 84 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, | |
| 85 | const lv_32fc_t* input, | ||
| 86 | const float* taps, | ||
| 87 | unsigned int num_points) | ||
| 88 | { | ||
| 89 | |||
| 90 | 2 | unsigned int number = 0; | |
| 91 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 92 | |||
| 93 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 94 | 2 | const float* aPtr = (float*)input; | |
| 95 | 2 | const float* bPtr = taps; | |
| 96 | |||
| 97 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 98 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 99 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
| 100 | |||
| 101 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 102 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 103 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 104 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 105 | |||
| 106 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
| 107 | |||
| 108 | 16382 | a0Val = _mm256_load_ps(aPtr); | |
| 109 | 16382 | a1Val = _mm256_load_ps(aPtr + 8); | |
| 110 | 16382 | a2Val = _mm256_load_ps(aPtr + 16); | |
| 111 | 32764 | a3Val = _mm256_load_ps(aPtr + 24); | |
| 112 | |||
| 113 | 16382 | x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
| 114 | 32764 | x1Val = _mm256_load_ps(bPtr + 8); | |
| 115 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
| 116 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
| 117 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
| 118 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
| 119 | |||
| 120 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
| 121 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
| 122 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
| 123 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
| 124 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
| 125 | |||
| 126 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 127 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 128 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 129 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 130 | |||
| 131 | 16382 | aPtr += 32; | |
| 132 | 16382 | bPtr += 16; | |
| 133 | } | ||
| 134 | |||
| 135 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 136 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 137 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 138 | |||
| 139 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 140 | |||
| 141 | _mm256_store_ps(dotProductVector, | ||
| 142 | dotProdVal0); // Store the results back into the dot product vector | ||
| 143 | |||
| 144 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 145 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 146 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
| 147 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
| 148 | |||
| 149 | 2 | number = sixteenthPoints * 16; | |
| 150 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
| 151 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 152 | 30 | aPtr += 2; | |
| 153 | 30 | bPtr += 1; | |
| 154 | } | ||
| 155 | |||
| 156 | 2 | *result = returnValue; | |
| 157 | 2 | } | |
| 158 | |||
| 159 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
| 160 | |||
| 161 | #ifdef LV_HAVE_AVX | ||
| 162 | |||
| 163 | #include <immintrin.h> | ||
| 164 | |||
| 165 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result, | |
| 166 | const lv_32fc_t* input, | ||
| 167 | const float* taps, | ||
| 168 | unsigned int num_points) | ||
| 169 | { | ||
| 170 | |||
| 171 | 2 | unsigned int number = 0; | |
| 172 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 173 | |||
| 174 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 175 | 2 | const float* aPtr = (float*)input; | |
| 176 | 2 | const float* bPtr = taps; | |
| 177 | |||
| 178 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 179 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 180 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
| 181 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
| 182 | |||
| 183 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 184 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 185 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 186 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 187 | |||
| 188 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
| 189 | |||
| 190 | 16382 | a0Val = _mm256_load_ps(aPtr); | |
| 191 | 16382 | a1Val = _mm256_load_ps(aPtr + 8); | |
| 192 | 16382 | a2Val = _mm256_load_ps(aPtr + 16); | |
| 193 | 32764 | a3Val = _mm256_load_ps(aPtr + 24); | |
| 194 | |||
| 195 | 16382 | x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
| 196 | 32764 | x1Val = _mm256_load_ps(bPtr + 8); | |
| 197 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
| 198 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
| 199 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
| 200 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
| 201 | |||
| 202 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
| 203 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
| 204 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
| 205 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
| 206 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
| 207 | |||
| 208 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
| 209 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
| 210 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
| 211 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
| 212 | |||
| 213 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
| 214 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
| 215 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
| 216 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
| 217 | |||
| 218 | 16382 | aPtr += 32; | |
| 219 | 16382 | bPtr += 16; | |
| 220 | } | ||
| 221 | |||
| 222 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 223 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 224 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 225 | |||
| 226 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 227 | |||
| 228 | _mm256_store_ps(dotProductVector, | ||
| 229 | dotProdVal0); // Store the results back into the dot product vector | ||
| 230 | |||
| 231 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 232 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 233 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
| 234 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
| 235 | |||
| 236 | 2 | number = sixteenthPoints * 16; | |
| 237 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
| 238 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 239 | 30 | aPtr += 2; | |
| 240 | 30 | bPtr += 1; | |
| 241 | } | ||
| 242 | |||
| 243 | 2 | *result = returnValue; | |
| 244 | 2 | } | |
| 245 | |||
| 246 | #endif /*LV_HAVE_AVX*/ | ||
| 247 | |||
| 248 | |||
| 249 | #ifdef LV_HAVE_SSE | ||
| 250 | |||
| 251 | |||
| 252 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result, | |
| 253 | const lv_32fc_t* input, | ||
| 254 | const float* taps, | ||
| 255 | unsigned int num_points) | ||
| 256 | { | ||
| 257 | |||
| 258 | 2 | unsigned int number = 0; | |
| 259 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 260 | |||
| 261 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 262 | 2 | const float* aPtr = (float*)input; | |
| 263 | 2 | const float* bPtr = taps; | |
| 264 | |||
| 265 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
| 266 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
| 267 | __m128 x0Val, x1Val, x2Val, x3Val; | ||
| 268 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
| 269 | |||
| 270 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
| 271 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
| 272 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
| 273 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
| 274 | |||
| 275 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 276 | |||
| 277 | 32766 | a0Val = _mm_load_ps(aPtr); | |
| 278 | 32766 | a1Val = _mm_load_ps(aPtr + 4); | |
| 279 | 32766 | a2Val = _mm_load_ps(aPtr + 8); | |
| 280 | 65532 | a3Val = _mm_load_ps(aPtr + 12); | |
| 281 | |||
| 282 | 32766 | x0Val = _mm_load_ps(bPtr); | |
| 283 | 32766 | x1Val = _mm_load_ps(bPtr); | |
| 284 | 32766 | x2Val = _mm_load_ps(bPtr + 4); | |
| 285 | 65532 | x3Val = _mm_load_ps(bPtr + 4); | |
| 286 | 32766 | b0Val = _mm_unpacklo_ps(x0Val, x1Val); | |
| 287 | 32766 | b1Val = _mm_unpackhi_ps(x0Val, x1Val); | |
| 288 | 32766 | b2Val = _mm_unpacklo_ps(x2Val, x3Val); | |
| 289 | 32766 | b3Val = _mm_unpackhi_ps(x2Val, x3Val); | |
| 290 | |||
| 291 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
| 292 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
| 293 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
| 294 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
| 295 | |||
| 296 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
| 297 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
| 298 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
| 299 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
| 300 | |||
| 301 | 32766 | aPtr += 16; | |
| 302 | 32766 | bPtr += 8; | |
| 303 | } | ||
| 304 | |||
| 305 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
| 306 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
| 307 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
| 308 | |||
| 309 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
| 310 | |||
| 311 | _mm_store_ps(dotProductVector, | ||
| 312 | dotProdVal0); // Store the results back into the dot product vector | ||
| 313 | |||
| 314 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 315 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 316 | |||
| 317 | 2 | number = eighthPoints * 8; | |
| 318 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 319 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 320 | 14 | aPtr += 2; | |
| 321 | 14 | bPtr += 1; | |
| 322 | } | ||
| 323 | |||
| 324 | 2 | *result = returnValue; | |
| 325 | 2 | } | |
| 326 | |||
| 327 | #endif /*LV_HAVE_SSE*/ | ||
| 328 | |||
| 329 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 330 | |||
| 331 | #include <immintrin.h> | ||
| 332 | |||
| 333 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, | |
| 334 | const lv_32fc_t* input, | ||
| 335 | const float* taps, | ||
| 336 | unsigned int num_points) | ||
| 337 | { | ||
| 338 | |||
| 339 | 2 | unsigned int number = 0; | |
| 340 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 341 | |||
| 342 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 343 | 2 | const float* aPtr = (float*)input; | |
| 344 | 2 | const float* bPtr = taps; | |
| 345 | |||
| 346 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 347 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 348 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
| 349 | |||
| 350 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 351 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 352 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 353 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 354 | |||
| 355 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
| 356 | |||
| 357 | 16382 | a0Val = _mm256_loadu_ps(aPtr); | |
| 358 | 16382 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
| 359 | 16382 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
| 360 | 32764 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
| 361 | |||
| 362 | 16382 | x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
| 363 | 32764 | x1Val = _mm256_loadu_ps(bPtr + 8); | |
| 364 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
| 365 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
| 366 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
| 367 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
| 368 | |||
| 369 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
| 370 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
| 371 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
| 372 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
| 373 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
| 374 | |||
| 375 | 16382 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 376 | 16382 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 377 | 16382 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 378 | 16382 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 379 | |||
| 380 | 16382 | aPtr += 32; | |
| 381 | 16382 | bPtr += 16; | |
| 382 | } | ||
| 383 | |||
| 384 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 385 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 386 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 387 | |||
| 388 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 389 | |||
| 390 | _mm256_store_ps(dotProductVector, | ||
| 391 | dotProdVal0); // Store the results back into the dot product vector | ||
| 392 | |||
| 393 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 394 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 395 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
| 396 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
| 397 | |||
| 398 | 2 | number = sixteenthPoints * 16; | |
| 399 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
| 400 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 401 | 30 | aPtr += 2; | |
| 402 | 30 | bPtr += 1; | |
| 403 | } | ||
| 404 | |||
| 405 | 2 | *result = returnValue; | |
| 406 | 2 | } | |
| 407 | |||
| 408 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
| 409 | |||
| 410 | #ifdef LV_HAVE_AVX | ||
| 411 | |||
| 412 | #include <immintrin.h> | ||
| 413 | |||
| 414 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result, | |
| 415 | const lv_32fc_t* input, | ||
| 416 | const float* taps, | ||
| 417 | unsigned int num_points) | ||
| 418 | { | ||
| 419 | |||
| 420 | 2 | unsigned int number = 0; | |
| 421 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 422 | |||
| 423 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 424 | 2 | const float* aPtr = (float*)input; | |
| 425 | 2 | const float* bPtr = taps; | |
| 426 | |||
| 427 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 428 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 429 | __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; | ||
| 430 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
| 431 | |||
| 432 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 433 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 434 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 435 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 436 | |||
| 437 |
2/2✓ Branch 0 taken 16382 times.
✓ Branch 1 taken 2 times.
|
16384 | for (; number < sixteenthPoints; number++) { |
| 438 | |||
| 439 | 16382 | a0Val = _mm256_loadu_ps(aPtr); | |
| 440 | 16382 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
| 441 | 16382 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
| 442 | 32764 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
| 443 | |||
| 444 | 16382 | x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 | |
| 445 | 32764 | x1Val = _mm256_loadu_ps(bPtr + 8); | |
| 446 | 16382 | x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 | |
| 447 | 16382 | x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 | |
| 448 | 16382 | x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); | |
| 449 | 16382 | x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); | |
| 450 | |||
| 451 | // TODO: it may be possible to rearrange swizzling to better pipeline data | ||
| 452 | 16382 | b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 | |
| 453 | 16382 | b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 | |
| 454 | 16382 | b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); | |
| 455 | 16382 | b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); | |
| 456 | |||
| 457 | 16382 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
| 458 | 16382 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
| 459 | 16382 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
| 460 | 16382 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
| 461 | |||
| 462 | 16382 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
| 463 | 16382 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
| 464 | 16382 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
| 465 | 16382 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
| 466 | |||
| 467 | 16382 | aPtr += 32; | |
| 468 | 16382 | bPtr += 16; | |
| 469 | } | ||
| 470 | |||
| 471 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 472 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 473 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 474 | |||
| 475 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 476 | |||
| 477 | _mm256_store_ps(dotProductVector, | ||
| 478 | dotProdVal0); // Store the results back into the dot product vector | ||
| 479 | |||
| 480 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 481 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 482 | 2 | returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]); | |
| 483 | 2 | returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]); | |
| 484 | |||
| 485 | 2 | number = sixteenthPoints * 16; | |
| 486 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
32 | for (; number < num_points; number++) { |
| 487 | 30 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 488 | 30 | aPtr += 2; | |
| 489 | 30 | bPtr += 1; | |
| 490 | } | ||
| 491 | |||
| 492 | 2 | *result = returnValue; | |
| 493 | 2 | } | |
| 494 | #endif /*LV_HAVE_AVX*/ | ||
| 495 | |||
| 496 | #ifdef LV_HAVE_NEON | ||
| 497 | #include <arm_neon.h> | ||
| 498 | |||
/*!
 * \brief Complex-by-real dot product using NEON, two vectors per iteration.
 *
 * Handles 8 complex samples per iteration (two groups of four), keeping
 * separate real and imaginary accumulators for each group; the remaining
 * (num_points % 8) samples are finished with scalar code.
 *
 * \param result      receives the complex sum of input[i] * taps[i]
 * \param input       complex samples, read as interleaved (re, im) floats
 * \param taps        real-valued taps
 * \param num_points  number of samples in both \p input and \p taps
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    // Start all four accumulators at zero.
    float32x4_t real_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t real_accumulator1 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator1 = vdupq_n_f32(0.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // Four consecutive taps per vector.
        const float32x4_t tapsVector0 = vld1q_f32(tapsPtr);
        const float32x4_t tapsVector1 = vld1q_f32(tapsPtr + 4);

        // vld2q_f32 de-interleaves four complex samples:
        // val[0] holds the real parts, val[1] the imaginary parts.
        const float32x4x2_t inputVector0 = vld2q_f32(inputPtr);
        const float32x4x2_t inputVector1 = vld2q_f32(inputPtr + 8);

        real_accumulator0 =
            vaddq_f32(real_accumulator0, vmulq_f32(tapsVector0, inputVector0.val[0]));
        imag_accumulator0 =
            vaddq_f32(imag_accumulator0, vmulq_f32(tapsVector0, inputVector0.val[1]));

        real_accumulator1 =
            vaddq_f32(real_accumulator1, vmulq_f32(tapsVector1, inputVector1.val[0]));
        imag_accumulator1 =
            vaddq_f32(imag_accumulator1, vmulq_f32(tapsVector1, inputVector1.val[1]));

        tapsPtr += 8;
        inputPtr += 16;
    }

    // Merge the two groups, then sum the four lanes of each part.
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // Scalar tail for the samples that did not fill a block of 8.
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
| 575 | |||
| 576 | #endif /*LV_HAVE_NEON*/ | ||
| 577 | |||
| 578 | #ifdef LV_HAVE_NEON | ||
| 579 | #include <arm_neon.h> | ||
| 580 | |||
/*!
 * \brief Complex-by-real dot product using NEON, one vector per iteration.
 *
 * Handles 4 complex samples per iteration with one real and one imaginary
 * accumulator; the remaining (num_points % 4) samples are finished with
 * scalar code.
 *
 * \param result      receives the complex sum of input[i] * taps[i]
 * \param input       complex samples, read as interleaved (re, im) floats
 * \param taps        real-valued taps
 * \param num_points  number of samples in both \p input and \p taps
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    // Start both accumulators at zero.
    float32x4_t real_accumulator = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator = vdupq_n_f32(0.0f);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        // Four consecutive taps.
        const float32x4_t tapsVector = vld1q_f32(tapsPtr);

        // vld2q_f32 de-interleaves four complex samples:
        // val[0] holds the real parts, val[1] the imaginary parts.
        const float32x4x2_t inputVector = vld2q_f32(inputPtr);

        real_accumulator =
            vaddq_f32(real_accumulator, vmulq_f32(tapsVector, inputVector.val[0]));
        imag_accumulator =
            vaddq_f32(imag_accumulator, vmulq_f32(tapsVector, inputVector.val[1]));

        tapsPtr += 4;
        inputPtr += 8;
    }

    // Sum the four lanes of each part into one complex value.
    vst1q_f32(accVector_real, real_accumulator);
    vst1q_f32(accVector_imag, imag_accumulator);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // Scalar tail for the samples that did not fill a block of 4.
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
| 643 | |||
| 644 | #endif /*LV_HAVE_NEON*/ | ||
| 645 | |||
| 646 | #ifdef LV_HAVE_NEONV7 | ||
| 647 | /* Declaration only: no body for this kernel is visible in this header. It takes the same argument list as the other kernels here. NOTE(review): the name suggests an out-of-line NEON assembly implementation - confirm against the build. */ extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result, | ||
| 648 | const lv_32fc_t* input, | ||
| 649 | const float* taps, | ||
| 650 | unsigned int num_points); | ||
| 651 | #endif /*LV_HAVE_NEONV7*/ | ||
| 652 | |||
| 653 | #ifdef LV_HAVE_NEONV7 | ||
| 654 | /* Declaration only: no body for this kernel is visible in this header. It takes the same argument list as the other kernels here. NOTE(review): the name suggests an out-of-line NEON assembly variant built around VMLA - confirm against the build. */ extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result, | ||
| 655 | const lv_32fc_t* input, | ||
| 656 | const float* taps, | ||
| 657 | unsigned int num_points); | ||
| 658 | #endif /*LV_HAVE_NEONV7*/ | ||
| 659 | |||
| 660 | #ifdef LV_HAVE_NEONV7 | ||
| 661 | /* Declaration only: no body for this kernel is visible in this header. It takes the same argument list as the other kernels here. NOTE(review): the name suggests an out-of-line, software-pipelined NEON implementation - confirm against the build. */ extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result, | ||
| 662 | const lv_32fc_t* input, | ||
| 663 | const float* taps, | ||
| 664 | unsigned int num_points); | ||
| 665 | #endif /*LV_HAVE_NEONV7*/ | ||
| 666 | |||
| 667 | #ifdef LV_HAVE_SSE | ||
| 668 | |||
| 669 | 2 | static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, | |
| 670 | const lv_32fc_t* input, | ||
| 671 | const float* taps, | ||
| 672 | unsigned int num_points) | ||
| 673 | { | ||
| 674 | |||
| 675 | 2 | unsigned int number = 0; | |
| 676 | 2 | const unsigned int eighthPoints = num_points / 8; | |
| 677 | |||
| 678 | 2 | lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f); | |
| 679 | 2 | const float* aPtr = (float*)input; | |
| 680 | 2 | const float* bPtr = taps; | |
| 681 | |||
| 682 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
| 683 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
| 684 | __m128 x0Val, x1Val, x2Val, x3Val; | ||
| 685 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
| 686 | |||
| 687 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
| 688 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
| 689 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
| 690 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
| 691 | |||
| 692 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < eighthPoints; number++) { |
| 693 | |||
| 694 | 32766 | a0Val = _mm_loadu_ps(aPtr); | |
| 695 | 32766 | a1Val = _mm_loadu_ps(aPtr + 4); | |
| 696 | 32766 | a2Val = _mm_loadu_ps(aPtr + 8); | |
| 697 | 65532 | a3Val = _mm_loadu_ps(aPtr + 12); | |
| 698 | |||
| 699 | 32766 | x0Val = _mm_loadu_ps(bPtr); | |
| 700 | 32766 | x1Val = _mm_loadu_ps(bPtr); | |
| 701 | 32766 | x2Val = _mm_loadu_ps(bPtr + 4); | |
| 702 | 65532 | x3Val = _mm_loadu_ps(bPtr + 4); | |
| 703 | 32766 | b0Val = _mm_unpacklo_ps(x0Val, x1Val); | |
| 704 | 32766 | b1Val = _mm_unpackhi_ps(x0Val, x1Val); | |
| 705 | 32766 | b2Val = _mm_unpacklo_ps(x2Val, x3Val); | |
| 706 | 32766 | b3Val = _mm_unpackhi_ps(x2Val, x3Val); | |
| 707 | |||
| 708 | 32766 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
| 709 | 32766 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
| 710 | 32766 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
| 711 | 32766 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
| 712 | |||
| 713 | 32766 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
| 714 | 32766 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
| 715 | 32766 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
| 716 | 32766 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
| 717 | |||
| 718 | 32766 | aPtr += 16; | |
| 719 | 32766 | bPtr += 8; | |
| 720 | } | ||
| 721 | |||
| 722 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
| 723 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
| 724 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
| 725 | |||
| 726 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
| 727 | |||
| 728 | _mm_store_ps(dotProductVector, | ||
| 729 | dotProdVal0); // Store the results back into the dot product vector | ||
| 730 | |||
| 731 | 2 | returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]); | |
| 732 | 2 | returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]); | |
| 733 | |||
| 734 | 2 | number = eighthPoints * 8; | |
| 735 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 736 | 14 | returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]); | |
| 737 | 14 | aPtr += 2; | |
| 738 | 14 | bPtr += 1; | |
| 739 | } | ||
| 740 | |||
| 741 | 2 | *result = returnValue; | |
| 742 | 2 | } | |
| 743 | |||
| 744 | #endif /*LV_HAVE_SSE*/ | ||
| 745 | |||
| 746 | |||
| 747 | #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/ | ||
| 748 |