| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* -*- c++ -*- */ | ||
| 2 | /* | ||
| 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. | ||
| 4 | * | ||
| 5 | * This file is part of VOLK | ||
| 6 | * | ||
| 7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
| 8 | */ | ||
| 9 | |||
| 10 | #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H | ||
| 11 | #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H | ||
| 12 | |||
| 13 | #include <inttypes.h> | ||
| 14 | #include <stdio.h> | ||
| 15 | #include <volk/volk_complex.h> | ||
| 16 | |||
| 17 | #ifdef LV_HAVE_AVX2 | ||
| 18 | #include <immintrin.h> | ||
| 19 | /*! | ||
| 20 | \brief Multiplys the one complex vector with the complex conjugate of the second complex | ||
| 21 | vector and stores their results in the third vector \param cVector The complex vector | ||
| 22 | where the results will be stored \param aVector One of the complex vectors to be | ||
| 23 | multiplied \param bVector The complex vector which will be converted to complex | ||
| 24 | conjugate and multiplied \param num_points The number of complex values in aVector and | ||
| 25 | bVector to be multiplied together and stored into cVector | ||
| 26 | */ | ||
| 27 | 2 | static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, | |
| 28 | const lv_8sc_t* aVector, | ||
| 29 | const lv_8sc_t* bVector, | ||
| 30 | unsigned int num_points) | ||
| 31 | { | ||
| 32 | 2 | unsigned int number = 0; | |
| 33 | 2 | const unsigned int quarterPoints = num_points / 8; | |
| 34 | |||
| 35 | __m256i x, y, realz, imagz; | ||
| 36 | 2 | lv_16sc_t* c = cVector; | |
| 37 | 2 | const lv_8sc_t* a = aVector; | |
| 38 | 2 | const lv_8sc_t* b = bVector; | |
| 39 | __m256i conjugateSign = | ||
| 40 | 2 | _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); | |
| 41 | |||
| 42 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < quarterPoints; number++) { |
| 43 | // Convert 8 bit values into 16 bit values | ||
| 44 | 65532 | x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); | |
| 45 | 65532 | y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); | |
| 46 | |||
| 47 | // Calculate the ar*cr - ai*(-ci) portions | ||
| 48 | 32766 | realz = _mm256_madd_epi16(x, y); | |
| 49 | |||
| 50 | // Calculate the complex conjugate of the cr + ci j values | ||
| 51 | 32766 | y = _mm256_sign_epi16(y, conjugateSign); | |
| 52 | |||
| 53 | // Shift the order of the cr and ci values | ||
| 54 | 32766 | y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
| 55 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
| 56 | |||
| 57 | // Calculate the ar*(-ci) + cr*(ai) | ||
| 58 | 32766 | imagz = _mm256_madd_epi16(x, y); | |
| 59 | |||
| 60 | // Perform the addition of products | ||
| 61 | |||
| 62 | 98298 | _mm256_store_si256((__m256i*)c, | |
| 63 | _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), | ||
| 64 | _mm256_unpackhi_epi32(realz, imagz))); | ||
| 65 | |||
| 66 | 32766 | a += 8; | |
| 67 | 32766 | b += 8; | |
| 68 | 32766 | c += 8; | |
| 69 | } | ||
| 70 | |||
| 71 | 2 | number = quarterPoints * 8; | |
| 72 | 2 | int16_t* c16Ptr = (int16_t*)&cVector[number]; | |
| 73 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; | |
| 74 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
| 75 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 76 | 14 | float aReal = (float)*a8Ptr++; | |
| 77 | 14 | float aImag = (float)*a8Ptr++; | |
| 78 | 14 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
| 79 | 14 | float bReal = (float)*b8Ptr++; | |
| 80 | 14 | float bImag = (float)*b8Ptr++; | |
| 81 | 14 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); | |
| 82 | 14 | lv_32fc_t temp = aVal * bVal; | |
| 83 | |||
| 84 | 14 | *c16Ptr++ = (int16_t)lv_creal(temp); | |
| 85 | 14 | *c16Ptr++ = (int16_t)lv_cimag(temp); | |
| 86 | } | ||
| 87 | 2 | } | |
| 88 | #endif /* LV_HAVE_AVX2 */ | ||
| 89 | |||
| 90 | |||
| 91 | #ifdef LV_HAVE_SSE4_1 | ||
| 92 | #include <smmintrin.h> | ||
| 93 | /*! | ||
| 94 | \brief Multiplys the one complex vector with the complex conjugate of the second complex | ||
| 95 | vector and stores their results in the third vector \param cVector The complex vector | ||
| 96 | where the results will be stored \param aVector One of the complex vectors to be | ||
| 97 | multiplied \param bVector The complex vector which will be converted to complex | ||
| 98 | conjugate and multiplied \param num_points The number of complex values in aVector and | ||
| 99 | bVector to be multiplied together and stored into cVector | ||
| 100 | */ | ||
| 101 | 2 | static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, | |
| 102 | const lv_8sc_t* aVector, | ||
| 103 | const lv_8sc_t* bVector, | ||
| 104 | unsigned int num_points) | ||
| 105 | { | ||
| 106 | 2 | unsigned int number = 0; | |
| 107 | 2 | const unsigned int quarterPoints = num_points / 4; | |
| 108 | |||
| 109 | __m128i x, y, realz, imagz; | ||
| 110 | 2 | lv_16sc_t* c = cVector; | |
| 111 | 2 | const lv_8sc_t* a = aVector; | |
| 112 | 2 | const lv_8sc_t* b = bVector; | |
| 113 | 2 | __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); | |
| 114 | |||
| 115 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (; number < quarterPoints; number++) { |
| 116 | // Convert into 8 bit values into 16 bit values | ||
| 117 | 131068 | x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); | |
| 118 | 131068 | y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); | |
| 119 | |||
| 120 | // Calculate the ar*cr - ai*(-ci) portions | ||
| 121 | 65534 | realz = _mm_madd_epi16(x, y); | |
| 122 | |||
| 123 | // Calculate the complex conjugate of the cr + ci j values | ||
| 124 | 65534 | y = _mm_sign_epi16(y, conjugateSign); | |
| 125 | |||
| 126 | // Shift the order of the cr and ci values | ||
| 127 | 65534 | y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
| 128 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
| 129 | |||
| 130 | // Calculate the ar*(-ci) + cr*(ai) | ||
| 131 | 65534 | imagz = _mm_madd_epi16(x, y); | |
| 132 | |||
| 133 | 196602 | _mm_store_si128((__m128i*)c, | |
| 134 | _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), | ||
| 135 | _mm_unpackhi_epi32(realz, imagz))); | ||
| 136 | |||
| 137 | 65534 | a += 4; | |
| 138 | 65534 | b += 4; | |
| 139 | 65534 | c += 4; | |
| 140 | } | ||
| 141 | |||
| 142 | 2 | number = quarterPoints * 4; | |
| 143 | 2 | int16_t* c16Ptr = (int16_t*)&cVector[number]; | |
| 144 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; | |
| 145 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
| 146 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (; number < num_points; number++) { |
| 147 | 6 | float aReal = (float)*a8Ptr++; | |
| 148 | 6 | float aImag = (float)*a8Ptr++; | |
| 149 | 6 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
| 150 | 6 | float bReal = (float)*b8Ptr++; | |
| 151 | 6 | float bImag = (float)*b8Ptr++; | |
| 152 | 6 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); | |
| 153 | 6 | lv_32fc_t temp = aVal * bVal; | |
| 154 | |||
| 155 | 6 | *c16Ptr++ = (int16_t)lv_creal(temp); | |
| 156 | 6 | *c16Ptr++ = (int16_t)lv_cimag(temp); | |
| 157 | } | ||
| 158 | 2 | } | |
| 159 | #endif /* LV_HAVE_SSE4_1 */ | ||
| 160 | |||
| 161 | #ifdef LV_HAVE_GENERIC | ||
| 162 | /*! | ||
| 163 | \brief Multiplys the one complex vector with the complex conjugate of the second complex | ||
| 164 | vector and stores their results in the third vector \param cVector The complex vector | ||
| 165 | where the results will be stored \param aVector One of the complex vectors to be | ||
| 166 | multiplied \param bVector The complex vector which will be converted to complex | ||
| 167 | conjugate and multiplied \param num_points The number of complex values in aVector and | ||
| 168 | bVector to be multiplied together and stored into cVector | ||
| 169 | */ | ||
| 170 | 2 | static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, | |
| 171 | const lv_8sc_t* aVector, | ||
| 172 | const lv_8sc_t* bVector, | ||
| 173 | unsigned int num_points) | ||
| 174 | { | ||
| 175 | 2 | unsigned int number = 0; | |
| 176 | 2 | int16_t* c16Ptr = (int16_t*)cVector; | |
| 177 | 2 | int8_t* a8Ptr = (int8_t*)aVector; | |
| 178 | 2 | int8_t* b8Ptr = (int8_t*)bVector; | |
| 179 |
2/2✓ Branch 0 taken 262142 times.
✓ Branch 1 taken 2 times.
|
262144 | for (number = 0; number < num_points; number++) { |
| 180 | 262142 | float aReal = (float)*a8Ptr++; | |
| 181 | 262142 | float aImag = (float)*a8Ptr++; | |
| 182 | 262142 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
| 183 | 262142 | float bReal = (float)*b8Ptr++; | |
| 184 | 262142 | float bImag = (float)*b8Ptr++; | |
| 185 | 262142 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); | |
| 186 | 262142 | lv_32fc_t temp = aVal * bVal; | |
| 187 | |||
| 188 | 262142 | *c16Ptr++ = (int16_t)lv_creal(temp); | |
| 189 | 262142 | *c16Ptr++ = (int16_t)lv_cimag(temp); | |
| 190 | } | ||
| 191 | 2 | } | |
| 192 | #endif /* LV_HAVE_GENERIC */ | ||
| 193 | |||
| 194 | #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */ | ||
| 195 | |||
| 196 | #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H | ||
| 197 | #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H | ||
| 198 | |||
| 199 | #include <inttypes.h> | ||
| 200 | #include <stdio.h> | ||
| 201 | #include <volk/volk_complex.h> | ||
| 202 | |||
| 203 | #ifdef LV_HAVE_AVX2 | ||
| 204 | #include <immintrin.h> | ||
| 205 | /*! | ||
| 206 | \brief Multiplys the one complex vector with the complex conjugate of the second complex | ||
| 207 | vector and stores their results in the third vector \param cVector The complex vector | ||
| 208 | where the results will be stored \param aVector One of the complex vectors to be | ||
| 209 | multiplied \param bVector The complex vector which will be converted to complex | ||
| 210 | conjugate and multiplied \param num_points The number of complex values in aVector and | ||
| 211 | bVector to be multiplied together and stored into cVector | ||
| 212 | */ | ||
| 213 | 2 | static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, | |
| 214 | const lv_8sc_t* aVector, | ||
| 215 | const lv_8sc_t* bVector, | ||
| 216 | unsigned int num_points) | ||
| 217 | { | ||
| 218 | 2 | unsigned int number = 0; | |
| 219 | 2 | const unsigned int oneEigthPoints = num_points / 8; | |
| 220 | |||
| 221 | __m256i x, y, realz, imagz; | ||
| 222 | 2 | lv_16sc_t* c = cVector; | |
| 223 | 2 | const lv_8sc_t* a = aVector; | |
| 224 | 2 | const lv_8sc_t* b = bVector; | |
| 225 | __m256i conjugateSign = | ||
| 226 | 2 | _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); | |
| 227 | |||
| 228 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (; number < oneEigthPoints; number++) { |
| 229 | // Convert 8 bit values into 16 bit values | ||
| 230 | 65532 | x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); | |
| 231 | 65532 | y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); | |
| 232 | |||
| 233 | // Calculate the ar*cr - ai*(-ci) portions | ||
| 234 | 32766 | realz = _mm256_madd_epi16(x, y); | |
| 235 | |||
| 236 | // Calculate the complex conjugate of the cr + ci j values | ||
| 237 | 32766 | y = _mm256_sign_epi16(y, conjugateSign); | |
| 238 | |||
| 239 | // Shift the order of the cr and ci values | ||
| 240 | 32766 | y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), | |
| 241 | _MM_SHUFFLE(2, 3, 0, 1)); | ||
| 242 | |||
| 243 | // Calculate the ar*(-ci) + cr*(ai) | ||
| 244 | 32766 | imagz = _mm256_madd_epi16(x, y); | |
| 245 | |||
| 246 | // Perform the addition of products | ||
| 247 | |||
| 248 | 98298 | _mm256_storeu_si256((__m256i*)c, | |
| 249 | _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), | ||
| 250 | _mm256_unpackhi_epi32(realz, imagz))); | ||
| 251 | |||
| 252 | 32766 | a += 8; | |
| 253 | 32766 | b += 8; | |
| 254 | 32766 | c += 8; | |
| 255 | } | ||
| 256 | |||
| 257 | 2 | number = oneEigthPoints * 8; | |
| 258 | 2 | int16_t* c16Ptr = (int16_t*)&cVector[number]; | |
| 259 | 2 | int8_t* a8Ptr = (int8_t*)&aVector[number]; | |
| 260 | 2 | int8_t* b8Ptr = (int8_t*)&bVector[number]; | |
| 261 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (; number < num_points; number++) { |
| 262 | 14 | float aReal = (float)*a8Ptr++; | |
| 263 | 14 | float aImag = (float)*a8Ptr++; | |
| 264 | 14 | lv_32fc_t aVal = lv_cmake(aReal, aImag); | |
| 265 | 14 | float bReal = (float)*b8Ptr++; | |
| 266 | 14 | float bImag = (float)*b8Ptr++; | |
| 267 | 14 | lv_32fc_t bVal = lv_cmake(bReal, -bImag); | |
| 268 | 14 | lv_32fc_t temp = aVal * bVal; | |
| 269 | |||
| 270 | 14 | *c16Ptr++ = (int16_t)lv_creal(temp); | |
| 271 | 14 | *c16Ptr++ = (int16_t)lv_cimag(temp); | |
| 272 | } | ||
| 273 | 2 | } | |
| 274 | #endif /* LV_HAVE_AVX2 */ | ||
| 275 | |||
| 276 | #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */ | ||
| 277 |