| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | /*! | ||
| 11 | * \page volk_32f_x2_dot_prod_16i | ||
| 12 | * | ||
| 13 | * \b Overview | ||
| 14 | * | ||
| 15 | * This block computes the dot product (or inner product) between two | ||
| 16 | * vectors, the \p input and \p taps vectors. Given a set of \p | ||
| 17 | * num_points taps, the result is the sum of products between the two | ||
| 18 | * vectors. The result is a single value stored in the \p result | ||
| 19 | * address and is converted to a fixed-point short. | ||
| 20 | * | ||
| 21 | * <b>Dispatcher Prototype</b> | ||
| 22 | * \code | ||
| 23 | * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, | ||
| 24 | * unsigned int num_points) \endcode | ||
| 25 | * | ||
| 26 | * \b Inputs | ||
| 27 | * \li input: vector of floats. | ||
| 28 | * \li taps: float taps. | ||
| 29 | * \li num_points: number of samples in both \p input and \p taps. | ||
| 30 | * | ||
| 31 | * \b Outputs | ||
| 32 | * \li result: pointer to a short value to hold the dot product result. | ||
| 33 | * | ||
| 34 | * \b Example | ||
| 35 | * \code | ||
| 36 | * int N = 10000; | ||
| 37 | * | ||
| 38 | * <FIXME> | ||
| 39 | * | ||
| 40 | * volk_32f_x2_dot_prod_16i(&result, input, taps, N); | ||
| 41 | * | ||
| 42 | * \endcode | ||
| 43 | */ | ||
| 44 | |||
| 45 | #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H | ||
| 46 | #define INCLUDED_volk_32f_x2_dot_prod_16i_H | ||
| 47 | |||
| 48 | #include <stdio.h> | ||
| 49 | #include <volk/volk_common.h> | ||
| 50 | |||
| 51 | |||
| 52 | #ifdef LV_HAVE_GENERIC | ||
| 53 | |||
| 54 | |||
| 55 | 2 | static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, | |
| 56 | const float* input, | ||
| 57 | const float* taps, | ||
| 58 | unsigned int num_points) | ||
| 59 | { | ||
| 60 | |||
| 61 | 2 | float dotProduct = 0; | |
| 62 | 2 | const float* aPtr = input; | |
| 63 | 2 | const float* bPtr = taps; | |
| 64 | 2 | unsigned int number = 0; | |
| 65 | |||
| 66 | 2/2 ✓ Branch 0 taken 262142 times. ✓ Branch 1 taken 2 times. | 262144 | for (number = 0; number < num_points; number++) { |
| 67 | 262142 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 68 | } | ||
| 69 | |||
| 70 | 2 | *result = (int16_t)dotProduct; | |
| 71 | 2 | } | |
| 72 | |||
| 73 | #endif /*LV_HAVE_GENERIC*/ | ||
| 74 | |||
| 75 | |||
| 76 | #ifdef LV_HAVE_SSE | ||
| 77 | |||
| 78 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, | |
| 79 | const float* input, | ||
| 80 | const float* taps, | ||
| 81 | unsigned int num_points) | ||
| 82 | { | ||
| 83 | |||
| 84 | 2 | unsigned int number = 0; | |
| 85 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 86 | |||
| 87 | 2 | float dotProduct = 0; | |
| 88 | 2 | const float* aPtr = input; | |
| 89 | 2 | const float* bPtr = taps; | |
| 90 | |||
| 91 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
| 92 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
| 93 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
| 94 | |||
| 95 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
| 96 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
| 97 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
| 98 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
| 99 | |||
| 100 | 2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times. | 16384 | for (; number < sixteenthPoints; number++) { |
| 101 | |||
| 102 | 16382 | a0Val = _mm_load_ps(aPtr); | |
| 103 | 16382 | a1Val = _mm_load_ps(aPtr + 4); | |
| 104 | 16382 | a2Val = _mm_load_ps(aPtr + 8); | |
| 105 | 32764 | a3Val = _mm_load_ps(aPtr + 12); | |
| 106 | 16382 | b0Val = _mm_load_ps(bPtr); | |
| 107 | 16382 | b1Val = _mm_load_ps(bPtr + 4); | |
| 108 | 16382 | b2Val = _mm_load_ps(bPtr + 8); | |
| 109 | 32764 | b3Val = _mm_load_ps(bPtr + 12); | |
| 110 | |||
| 111 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
| 112 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
| 113 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
| 114 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
| 115 | |||
| 116 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
| 117 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
| 118 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
| 119 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
| 120 | |||
| 121 | 16382 | aPtr += 16; | |
| 122 | 16382 | bPtr += 16; | |
| 123 | } | ||
| 124 | |||
| 125 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
| 126 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
| 127 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
| 128 | |||
| 129 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
| 130 | |||
| 131 | _mm_store_ps(dotProductVector, | ||
| 132 | dotProdVal0); // Store the results back into the dot product vector | ||
| 133 | |||
| 134 | 2 | dotProduct = dotProductVector[0]; | |
| 135 | 2 | dotProduct += dotProductVector[1]; | |
| 136 | 2 | dotProduct += dotProductVector[2]; | |
| 137 | 2 | dotProduct += dotProductVector[3]; | |
| 138 | |||
| 139 | 2 | number = sixteenthPoints * 16; | |
| 140 | 2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times. | 32 | for (; number < num_points; number++) { |
| 141 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 142 | } | ||
| 143 | |||
| 144 | 2 | *result = (short)dotProduct; | |
| 145 | 2 | } | |
| 146 | |||
| 147 | #endif /*LV_HAVE_SSE*/ | ||
| 148 | |||
| 149 | |||
| 150 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 151 | |||
| 152 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, | |
| 153 | const float* input, | ||
| 154 | const float* taps, | ||
| 155 | unsigned int num_points) | ||
| 156 | { | ||
| 157 | |||
| 158 | 2 | unsigned int number = 0; | |
| 159 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
| 160 | |||
| 161 | 2 | float dotProduct = 0; | |
| 162 | 2 | const float* aPtr = input; | |
| 163 | 2 | const float* bPtr = taps; | |
| 164 | |||
| 165 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 166 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 167 | |||
| 168 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 169 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 170 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 171 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 172 | |||
| 173 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times. | 8192 | for (; number < thirtysecondPoints; number++) { |
| 174 | |||
| 175 | 8190 | a0Val = _mm256_load_ps(aPtr); | |
| 176 | 8190 | a1Val = _mm256_load_ps(aPtr + 8); | |
| 177 | 8190 | a2Val = _mm256_load_ps(aPtr + 16); | |
| 178 | 16380 | a3Val = _mm256_load_ps(aPtr + 24); | |
| 179 | 8190 | b0Val = _mm256_load_ps(bPtr); | |
| 180 | 8190 | b1Val = _mm256_load_ps(bPtr + 8); | |
| 181 | 8190 | b2Val = _mm256_load_ps(bPtr + 16); | |
| 182 | 16380 | b3Val = _mm256_load_ps(bPtr + 24); | |
| 183 | |||
| 184 | 8190 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 185 | 8190 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 186 | 8190 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 187 | 8190 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 188 | |||
| 189 | 8190 | aPtr += 32; | |
| 190 | 8190 | bPtr += 32; | |
| 191 | } | ||
| 192 | |||
| 193 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 194 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 195 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 196 | |||
| 197 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 198 | |||
| 199 | _mm256_store_ps(dotProductVector, | ||
| 200 | dotProdVal0); // Store the results back into the dot product vector | ||
| 201 | |||
| 202 | 2 | dotProduct = dotProductVector[0]; | |
| 203 | 2 | dotProduct += dotProductVector[1]; | |
| 204 | 2 | dotProduct += dotProductVector[2]; | |
| 205 | 2 | dotProduct += dotProductVector[3]; | |
| 206 | 2 | dotProduct += dotProductVector[4]; | |
| 207 | 2 | dotProduct += dotProductVector[5]; | |
| 208 | 2 | dotProduct += dotProductVector[6]; | |
| 209 | 2 | dotProduct += dotProductVector[7]; | |
| 210 | |||
| 211 | 2 | number = thirtysecondPoints * 32; | |
| 212 | 2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times. | 64 | for (; number < num_points; number++) { |
| 213 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 214 | } | ||
| 215 | |||
| 216 | 2 | *result = (short)dotProduct; | |
| 217 | 2 | } | |
| 218 | |||
| 219 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
| 220 | |||
| 221 | |||
| 222 | #ifdef LV_HAVE_AVX | ||
| 223 | |||
| 224 | 2 | static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, | |
| 225 | const float* input, | ||
| 226 | const float* taps, | ||
| 227 | unsigned int num_points) | ||
| 228 | { | ||
| 229 | |||
| 230 | 2 | unsigned int number = 0; | |
| 231 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
| 232 | |||
| 233 | 2 | float dotProduct = 0; | |
| 234 | 2 | const float* aPtr = input; | |
| 235 | 2 | const float* bPtr = taps; | |
| 236 | |||
| 237 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 238 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 239 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
| 240 | |||
| 241 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 242 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 243 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 244 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 245 | |||
| 246 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times. | 8192 | for (; number < thirtysecondPoints; number++) { |
| 247 | |||
| 248 | 8190 | a0Val = _mm256_load_ps(aPtr); | |
| 249 | 8190 | a1Val = _mm256_load_ps(aPtr + 8); | |
| 250 | 8190 | a2Val = _mm256_load_ps(aPtr + 16); | |
| 251 | 16380 | a3Val = _mm256_load_ps(aPtr + 24); | |
| 252 | 8190 | b0Val = _mm256_load_ps(bPtr); | |
| 253 | 8190 | b1Val = _mm256_load_ps(bPtr + 8); | |
| 254 | 8190 | b2Val = _mm256_load_ps(bPtr + 16); | |
| 255 | 16380 | b3Val = _mm256_load_ps(bPtr + 24); | |
| 256 | |||
| 257 | 8190 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
| 258 | 8190 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
| 259 | 8190 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
| 260 | 8190 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
| 261 | |||
| 262 | 8190 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
| 263 | 8190 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
| 264 | 8190 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
| 265 | 8190 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
| 266 | |||
| 267 | 8190 | aPtr += 32; | |
| 268 | 8190 | bPtr += 32; | |
| 269 | } | ||
| 270 | |||
| 271 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 272 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 273 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 274 | |||
| 275 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 276 | |||
| 277 | _mm256_store_ps(dotProductVector, | ||
| 278 | dotProdVal0); // Store the results back into the dot product vector | ||
| 279 | |||
| 280 | 2 | dotProduct = dotProductVector[0]; | |
| 281 | 2 | dotProduct += dotProductVector[1]; | |
| 282 | 2 | dotProduct += dotProductVector[2]; | |
| 283 | 2 | dotProduct += dotProductVector[3]; | |
| 284 | 2 | dotProduct += dotProductVector[4]; | |
| 285 | 2 | dotProduct += dotProductVector[5]; | |
| 286 | 2 | dotProduct += dotProductVector[6]; | |
| 287 | 2 | dotProduct += dotProductVector[7]; | |
| 288 | |||
| 289 | 2 | number = thirtysecondPoints * 32; | |
| 290 | 2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times. | 64 | for (; number < num_points; number++) { |
| 291 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 292 | } | ||
| 293 | |||
| 294 | 2 | *result = (short)dotProduct; | |
| 295 | 2 | } | |
| 296 | |||
| 297 | #endif /*LV_HAVE_AVX*/ | ||
| 298 | |||
| 299 | #ifdef LV_HAVE_AVX512F | ||
| 300 | |||
| 301 | ✗ | static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, | |
| 302 | const float* input, | ||
| 303 | const float* taps, | ||
| 304 | unsigned int num_points) | ||
| 305 | { | ||
| 306 | |||
| 307 | ✗ | unsigned int number = 0; | |
| 308 | ✗ | const unsigned int sixtyfourthPoints = num_points / 64; | |
| 309 | |||
| 310 | ✗ | float dotProduct = 0; | |
| 311 | ✗ | const float* aPtr = input; | |
| 312 | ✗ | const float* bPtr = taps; | |
| 313 | |||
| 314 | __m512 a0Val, a1Val, a2Val, a3Val; | ||
| 315 | __m512 b0Val, b1Val, b2Val, b3Val; | ||
| 316 | |||
| 317 | ✗ | __m512 dotProdVal0 = _mm512_setzero_ps(); | |
| 318 | ✗ | __m512 dotProdVal1 = _mm512_setzero_ps(); | |
| 319 | ✗ | __m512 dotProdVal2 = _mm512_setzero_ps(); | |
| 320 | ✗ | __m512 dotProdVal3 = _mm512_setzero_ps(); | |
| 321 | |||
| 322 | ✗ | for (; number < sixtyfourthPoints; number++) { | |
| 323 | |||
| 324 | ✗ | a0Val = _mm512_load_ps(aPtr); | |
| 325 | ✗ | a1Val = _mm512_load_ps(aPtr + 16); | |
| 326 | ✗ | a2Val = _mm512_load_ps(aPtr + 32); | |
| 327 | ✗ | a3Val = _mm512_load_ps(aPtr + 48); | |
| 328 | ✗ | b0Val = _mm512_load_ps(bPtr); | |
| 329 | ✗ | b1Val = _mm512_load_ps(bPtr + 16); | |
| 330 | ✗ | b2Val = _mm512_load_ps(bPtr + 32); | |
| 331 | ✗ | b3Val = _mm512_load_ps(bPtr + 48); | |
| 332 | |||
| 333 | ✗ | dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 334 | ✗ | dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 335 | ✗ | dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 336 | ✗ | dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 337 | |||
| 338 | ✗ | aPtr += 64; | |
| 339 | ✗ | bPtr += 64; | |
| 340 | } | ||
| 341 | |||
| 342 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); | |
| 343 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); | |
| 344 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); | |
| 345 | |||
| 346 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
| 347 | |||
| 348 | _mm512_store_ps(dotProductVector, | ||
| 349 | dotProdVal0); // Store the results back into the dot product vector | ||
| 350 | |||
| 351 | ✗ | dotProduct = dotProductVector[0]; | |
| 352 | ✗ | dotProduct += dotProductVector[1]; | |
| 353 | ✗ | dotProduct += dotProductVector[2]; | |
| 354 | ✗ | dotProduct += dotProductVector[3]; | |
| 355 | ✗ | dotProduct += dotProductVector[4]; | |
| 356 | ✗ | dotProduct += dotProductVector[5]; | |
| 357 | ✗ | dotProduct += dotProductVector[6]; | |
| 358 | ✗ | dotProduct += dotProductVector[7]; | |
| 359 | ✗ | dotProduct += dotProductVector[8]; | |
| 360 | ✗ | dotProduct += dotProductVector[9]; | |
| 361 | ✗ | dotProduct += dotProductVector[10]; | |
| 362 | ✗ | dotProduct += dotProductVector[11]; | |
| 363 | ✗ | dotProduct += dotProductVector[12]; | |
| 364 | ✗ | dotProduct += dotProductVector[13]; | |
| 365 | ✗ | dotProduct += dotProductVector[14]; | |
| 366 | ✗ | dotProduct += dotProductVector[15]; | |
| 367 | |||
| 368 | ✗ | number = sixtyfourthPoints * 64; | |
| 369 | ✗ | for (; number < num_points; number++) { | |
| 370 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 371 | } | ||
| 372 | |||
| 373 | ✗ | *result = (short)dotProduct; | |
| 374 | ✗ | } | |
| 375 | |||
| 376 | #endif /*LV_HAVE_AVX512F*/ | ||
| 377 | |||
| 378 | |||
| 379 | #ifdef LV_HAVE_SSE | ||
| 380 | |||
| 381 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, | |
| 382 | const float* input, | ||
| 383 | const float* taps, | ||
| 384 | unsigned int num_points) | ||
| 385 | { | ||
| 386 | |||
| 387 | 2 | unsigned int number = 0; | |
| 388 | 2 | const unsigned int sixteenthPoints = num_points / 16; | |
| 389 | |||
| 390 | 2 | float dotProduct = 0; | |
| 391 | 2 | const float* aPtr = input; | |
| 392 | 2 | const float* bPtr = taps; | |
| 393 | |||
| 394 | __m128 a0Val, a1Val, a2Val, a3Val; | ||
| 395 | __m128 b0Val, b1Val, b2Val, b3Val; | ||
| 396 | __m128 c0Val, c1Val, c2Val, c3Val; | ||
| 397 | |||
| 398 | 2 | __m128 dotProdVal0 = _mm_setzero_ps(); | |
| 399 | 2 | __m128 dotProdVal1 = _mm_setzero_ps(); | |
| 400 | 2 | __m128 dotProdVal2 = _mm_setzero_ps(); | |
| 401 | 2 | __m128 dotProdVal3 = _mm_setzero_ps(); | |
| 402 | |||
| 403 | 2/2 ✓ Branch 0 taken 16382 times. ✓ Branch 1 taken 2 times. | 16384 | for (; number < sixteenthPoints; number++) { |
| 404 | |||
| 405 | 16382 | a0Val = _mm_loadu_ps(aPtr); | |
| 406 | 16382 | a1Val = _mm_loadu_ps(aPtr + 4); | |
| 407 | 16382 | a2Val = _mm_loadu_ps(aPtr + 8); | |
| 408 | 32764 | a3Val = _mm_loadu_ps(aPtr + 12); | |
| 409 | 16382 | b0Val = _mm_loadu_ps(bPtr); | |
| 410 | 16382 | b1Val = _mm_loadu_ps(bPtr + 4); | |
| 411 | 16382 | b2Val = _mm_loadu_ps(bPtr + 8); | |
| 412 | 32764 | b3Val = _mm_loadu_ps(bPtr + 12); | |
| 413 | |||
| 414 | 16382 | c0Val = _mm_mul_ps(a0Val, b0Val); | |
| 415 | 16382 | c1Val = _mm_mul_ps(a1Val, b1Val); | |
| 416 | 16382 | c2Val = _mm_mul_ps(a2Val, b2Val); | |
| 417 | 16382 | c3Val = _mm_mul_ps(a3Val, b3Val); | |
| 418 | |||
| 419 | 16382 | dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); | |
| 420 | 16382 | dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); | |
| 421 | 16382 | dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); | |
| 422 | 16382 | dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); | |
| 423 | |||
| 424 | 16382 | aPtr += 16; | |
| 425 | 16382 | bPtr += 16; | |
| 426 | } | ||
| 427 | |||
| 428 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); | |
| 429 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); | |
| 430 | 2 | dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); | |
| 431 | |||
| 432 | __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; | ||
| 433 | |||
| 434 | _mm_store_ps(dotProductVector, | ||
| 435 | dotProdVal0); // Store the results back into the dot product vector | ||
| 436 | |||
| 437 | 2 | dotProduct = dotProductVector[0]; | |
| 438 | 2 | dotProduct += dotProductVector[1]; | |
| 439 | 2 | dotProduct += dotProductVector[2]; | |
| 440 | 2 | dotProduct += dotProductVector[3]; | |
| 441 | |||
| 442 | 2 | number = sixteenthPoints * 16; | |
| 443 | 2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 2 times. | 32 | for (; number < num_points; number++) { |
| 444 | 30 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 445 | } | ||
| 446 | |||
| 447 | 2 | *result = (short)dotProduct; | |
| 448 | 2 | } | |
| 449 | |||
| 450 | #endif /*LV_HAVE_SSE*/ | ||
| 451 | |||
| 452 | |||
| 453 | #if LV_HAVE_AVX2 && LV_HAVE_FMA | ||
| 454 | |||
| 455 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, | |
| 456 | const float* input, | ||
| 457 | const float* taps, | ||
| 458 | unsigned int num_points) | ||
| 459 | { | ||
| 460 | |||
| 461 | 2 | unsigned int number = 0; | |
| 462 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
| 463 | |||
| 464 | 2 | float dotProduct = 0; | |
| 465 | 2 | const float* aPtr = input; | |
| 466 | 2 | const float* bPtr = taps; | |
| 467 | |||
| 468 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 469 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 470 | |||
| 471 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 472 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 473 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 474 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 475 | |||
| 476 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times. | 8192 | for (; number < thirtysecondPoints; number++) { |
| 477 | |||
| 478 | 8190 | a0Val = _mm256_loadu_ps(aPtr); | |
| 479 | 8190 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
| 480 | 8190 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
| 481 | 16380 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
| 482 | 8190 | b0Val = _mm256_loadu_ps(bPtr); | |
| 483 | 8190 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
| 484 | 8190 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
| 485 | 16380 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
| 486 | |||
| 487 | 8190 | dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 488 | 8190 | dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 489 | 8190 | dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 490 | 8190 | dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 491 | |||
| 492 | 8190 | aPtr += 32; | |
| 493 | 8190 | bPtr += 32; | |
| 494 | } | ||
| 495 | |||
| 496 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 497 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 498 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 499 | |||
| 500 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 501 | |||
| 502 | _mm256_store_ps(dotProductVector, | ||
| 503 | dotProdVal0); // Store the results back into the dot product vector | ||
| 504 | |||
| 505 | 2 | dotProduct = dotProductVector[0]; | |
| 506 | 2 | dotProduct += dotProductVector[1]; | |
| 507 | 2 | dotProduct += dotProductVector[2]; | |
| 508 | 2 | dotProduct += dotProductVector[3]; | |
| 509 | 2 | dotProduct += dotProductVector[4]; | |
| 510 | 2 | dotProduct += dotProductVector[5]; | |
| 511 | 2 | dotProduct += dotProductVector[6]; | |
| 512 | 2 | dotProduct += dotProductVector[7]; | |
| 513 | |||
| 514 | 2 | number = thirtysecondPoints * 32; | |
| 515 | 2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times. | 64 | for (; number < num_points; number++) { |
| 516 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 517 | } | ||
| 518 | |||
| 519 | 2 | *result = (short)dotProduct; | |
| 520 | 2 | } | |
| 521 | |||
| 522 | #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ | ||
| 523 | |||
| 524 | |||
| 525 | #ifdef LV_HAVE_AVX | ||
| 526 | |||
| 527 | 2 | static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, | |
| 528 | const float* input, | ||
| 529 | const float* taps, | ||
| 530 | unsigned int num_points) | ||
| 531 | { | ||
| 532 | |||
| 533 | 2 | unsigned int number = 0; | |
| 534 | 2 | const unsigned int thirtysecondPoints = num_points / 32; | |
| 535 | |||
| 536 | 2 | float dotProduct = 0; | |
| 537 | 2 | const float* aPtr = input; | |
| 538 | 2 | const float* bPtr = taps; | |
| 539 | |||
| 540 | __m256 a0Val, a1Val, a2Val, a3Val; | ||
| 541 | __m256 b0Val, b1Val, b2Val, b3Val; | ||
| 542 | __m256 c0Val, c1Val, c2Val, c3Val; | ||
| 543 | |||
| 544 | 2 | __m256 dotProdVal0 = _mm256_setzero_ps(); | |
| 545 | 2 | __m256 dotProdVal1 = _mm256_setzero_ps(); | |
| 546 | 2 | __m256 dotProdVal2 = _mm256_setzero_ps(); | |
| 547 | 2 | __m256 dotProdVal3 = _mm256_setzero_ps(); | |
| 548 | |||
| 549 | 2/2 ✓ Branch 0 taken 8190 times. ✓ Branch 1 taken 2 times. | 8192 | for (; number < thirtysecondPoints; number++) { |
| 550 | |||
| 551 | 8190 | a0Val = _mm256_loadu_ps(aPtr); | |
| 552 | 8190 | a1Val = _mm256_loadu_ps(aPtr + 8); | |
| 553 | 8190 | a2Val = _mm256_loadu_ps(aPtr + 16); | |
| 554 | 16380 | a3Val = _mm256_loadu_ps(aPtr + 24); | |
| 555 | 8190 | b0Val = _mm256_loadu_ps(bPtr); | |
| 556 | 8190 | b1Val = _mm256_loadu_ps(bPtr + 8); | |
| 557 | 8190 | b2Val = _mm256_loadu_ps(bPtr + 16); | |
| 558 | 16380 | b3Val = _mm256_loadu_ps(bPtr + 24); | |
| 559 | |||
| 560 | 8190 | c0Val = _mm256_mul_ps(a0Val, b0Val); | |
| 561 | 8190 | c1Val = _mm256_mul_ps(a1Val, b1Val); | |
| 562 | 8190 | c2Val = _mm256_mul_ps(a2Val, b2Val); | |
| 563 | 8190 | c3Val = _mm256_mul_ps(a3Val, b3Val); | |
| 564 | |||
| 565 | 8190 | dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); | |
| 566 | 8190 | dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); | |
| 567 | 8190 | dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); | |
| 568 | 8190 | dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); | |
| 569 | |||
| 570 | 8190 | aPtr += 32; | |
| 571 | 8190 | bPtr += 32; | |
| 572 | } | ||
| 573 | |||
| 574 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); | |
| 575 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); | |
| 576 | 2 | dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); | |
| 577 | |||
| 578 | __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; | ||
| 579 | |||
| 580 | _mm256_store_ps(dotProductVector, | ||
| 581 | dotProdVal0); // Store the results back into the dot product vector | ||
| 582 | |||
| 583 | 2 | dotProduct = dotProductVector[0]; | |
| 584 | 2 | dotProduct += dotProductVector[1]; | |
| 585 | 2 | dotProduct += dotProductVector[2]; | |
| 586 | 2 | dotProduct += dotProductVector[3]; | |
| 587 | 2 | dotProduct += dotProductVector[4]; | |
| 588 | 2 | dotProduct += dotProductVector[5]; | |
| 589 | 2 | dotProduct += dotProductVector[6]; | |
| 590 | 2 | dotProduct += dotProductVector[7]; | |
| 591 | |||
| 592 | 2 | number = thirtysecondPoints * 32; | |
| 593 | 2/2 ✓ Branch 0 taken 62 times. ✓ Branch 1 taken 2 times. | 64 | for (; number < num_points; number++) { |
| 594 | 62 | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 595 | } | ||
| 596 | |||
| 597 | 2 | *result = (short)dotProduct; | |
| 598 | 2 | } | |
| 599 | |||
| 600 | #endif /*LV_HAVE_AVX*/ | ||
| 601 | |||
| 602 | #ifdef LV_HAVE_AVX512F | ||
| 603 | |||
| 604 | ✗ | static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, | |
| 605 | const float* input, | ||
| 606 | const float* taps, | ||
| 607 | unsigned int num_points) | ||
| 608 | { | ||
| 609 | |||
| 610 | ✗ | unsigned int number = 0; | |
| 611 | ✗ | const unsigned int sixtyfourthPoints = num_points / 64; | |
| 612 | |||
| 613 | ✗ | float dotProduct = 0; | |
| 614 | ✗ | const float* aPtr = input; | |
| 615 | ✗ | const float* bPtr = taps; | |
| 616 | |||
| 617 | __m512 a0Val, a1Val, a2Val, a3Val; | ||
| 618 | __m512 b0Val, b1Val, b2Val, b3Val; | ||
| 619 | |||
| 620 | ✗ | __m512 dotProdVal0 = _mm512_setzero_ps(); | |
| 621 | ✗ | __m512 dotProdVal1 = _mm512_setzero_ps(); | |
| 622 | ✗ | __m512 dotProdVal2 = _mm512_setzero_ps(); | |
| 623 | ✗ | __m512 dotProdVal3 = _mm512_setzero_ps(); | |
| 624 | |||
| 625 | ✗ | for (; number < sixtyfourthPoints; number++) { | |
| 626 | |||
| 627 | ✗ | a0Val = _mm512_loadu_ps(aPtr); | |
| 628 | ✗ | a1Val = _mm512_loadu_ps(aPtr + 16); | |
| 629 | ✗ | a2Val = _mm512_loadu_ps(aPtr + 32); | |
| 630 | ✗ | a3Val = _mm512_loadu_ps(aPtr + 48); | |
| 631 | ✗ | b0Val = _mm512_loadu_ps(bPtr); | |
| 632 | ✗ | b1Val = _mm512_loadu_ps(bPtr + 16); | |
| 633 | ✗ | b2Val = _mm512_loadu_ps(bPtr + 32); | |
| 634 | ✗ | b3Val = _mm512_loadu_ps(bPtr + 48); | |
| 635 | |||
| 636 | ✗ | dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); | |
| 637 | ✗ | dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); | |
| 638 | ✗ | dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); | |
| 639 | ✗ | dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); | |
| 640 | |||
| 641 | ✗ | aPtr += 64; | |
| 642 | ✗ | bPtr += 64; | |
| 643 | } | ||
| 644 | |||
| 645 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); | |
| 646 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); | |
| 647 | ✗ | dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); | |
| 648 | |||
| 649 | __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; | ||
| 650 | |||
| 651 | _mm512_storeu_ps(dotProductVector, | ||
| 652 | dotProdVal0); // Store the results back into the dot product vector | ||
| 653 | |||
| 654 | ✗ | dotProduct = dotProductVector[0]; | |
| 655 | ✗ | dotProduct += dotProductVector[1]; | |
| 656 | ✗ | dotProduct += dotProductVector[2]; | |
| 657 | ✗ | dotProduct += dotProductVector[3]; | |
| 658 | ✗ | dotProduct += dotProductVector[4]; | |
| 659 | ✗ | dotProduct += dotProductVector[5]; | |
| 660 | ✗ | dotProduct += dotProductVector[6]; | |
| 661 | ✗ | dotProduct += dotProductVector[7]; | |
| 662 | ✗ | dotProduct += dotProductVector[8]; | |
| 663 | ✗ | dotProduct += dotProductVector[9]; | |
| 664 | ✗ | dotProduct += dotProductVector[10]; | |
| 665 | ✗ | dotProduct += dotProductVector[11]; | |
| 666 | ✗ | dotProduct += dotProductVector[12]; | |
| 667 | ✗ | dotProduct += dotProductVector[13]; | |
| 668 | ✗ | dotProduct += dotProductVector[14]; | |
| 669 | ✗ | dotProduct += dotProductVector[15]; | |
| 670 | |||
| 671 | ✗ | number = sixtyfourthPoints * 64; | |
| 672 | ✗ | for (; number < num_points; number++) { | |
| 673 | ✗ | dotProduct += ((*aPtr++) * (*bPtr++)); | |
| 674 | } | ||
| 675 | |||
| 676 | ✗ | *result = (short)dotProduct; | |
| 677 | ✗ | } | |
| 678 | |||
| 679 | #endif /*LV_HAVE_AVX512F*/ | ||
| 680 | |||
| 681 | |||
| 682 | #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/ | ||
| 683 |