/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*!
 * \page volk_16ic_x2_dot_prod_16ic
 *
 * \b Overview
 *
 * Multiplies two input complex vectors (16-bit integer components) element by
 * element and accumulates the products into a single complex result. The
 * accumulation uses saturating arithmetic, so the result never wraps beyond
 * the limits of the data type.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result,
 *                                 const lv_16sc_t* in_a,
 *                                 const lv_16sc_t* in_b,
 *                                 unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li in_a: One of the vectors to be multiplied and accumulated.
 * \li in_b: The other vector to be multiplied and accumulated.
 * \li num_points: Number of complex values to be multiplied together,
 * accumulated, and stored into \p result.
 *
 * \b Outputs
 * \li result: Value of the accumulated result.
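 *
 * <b>Example</b>
 *
 * A minimal usage sketch; the buffer length and fill values below are
 * illustrative assumptions, not part of the kernel's API:
 * \code
 * unsigned int num_points = 1024;
 * size_t alignment = volk_get_alignment();
 * lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t result;
 * unsigned int n;
 * for (n = 0; n < num_points; n++) {
 *     in_a[n] = lv_cmake((int16_t)1, (int16_t)0); // 1 + 0i
 *     in_b[n] = lv_cmake((int16_t)0, (int16_t)1); // 0 + 1i
 * }
 * volk_16ic_x2_dot_prod_16ic(&result, in_a, in_b, num_points);
 * // every product is 0 + 1i, so result accumulates to 0 + 1024i
 * volk_free(in_a);
 * volk_free(in_b);
 * \endcode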
 *
 */
34 | |||
35 | #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H | ||
36 | #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H | ||
37 | |||
38 | #include <volk/saturation_arithmetic.h> | ||
39 | #include <volk/volk_common.h> | ||
40 | #include <volk/volk_complex.h> | ||
41 | |||
42 | |||
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // shift c right by one int16 so each
                                         // imag*imag product sits over its real*real product
            real = _mm_subs_epi16(c, c_sr); // a.r*b.r - a.i*b.i in the even lanes

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
145 | |||
146 | |||
147 | #ifdef LV_HAVE_SSE2 | ||
148 | #include <emmintrin.h> | ||
149 | |||
150 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, | |
151 | const lv_16sc_t* in_a, | ||
152 | const lv_16sc_t* in_b, | ||
153 | unsigned int num_points) | ||
154 | { | ||
155 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
156 | |||
157 | 2 | const unsigned int sse_iters = num_points / 4; | |
158 | |||
159 | 2 | const lv_16sc_t* _in_a = in_a; | |
160 | 2 | const lv_16sc_t* _in_b = in_b; | |
161 | 2 | lv_16sc_t* _out = out; | |
162 | unsigned int number; | ||
163 | |||
164 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (sse_iters > 0) { |
165 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
166 | realcacc, imagcacc, result; | ||
167 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||
168 | |||
169 | 2 | realcacc = _mm_setzero_si128(); | |
170 | 2 | imagcacc = _mm_setzero_si128(); | |
171 | |||
172 | 2 | mask_imag = _mm_set_epi8( | |
173 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
174 | 2 | mask_real = _mm_set_epi8( | |
175 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
176 | |||
177 |
2/2✓ Branch 0 taken 65534 times.
✓ Branch 1 taken 2 times.
|
65536 | for (number = 0; number < sse_iters; number++) { |
178 | // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||
179 | 65534 | a = _mm_loadu_si128( | |
180 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
181 | 65534 | __VOLK_PREFETCH(_in_a + 8); | |
182 | 65534 | b = _mm_loadu_si128((__m128i*)_in_b); | |
183 | 65534 | __VOLK_PREFETCH(_in_b + 8); | |
184 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
185 | |||
186 | 65534 | c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in | |
187 | // zeros, and store the results in dst. | ||
188 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
189 | |||
190 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
191 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
192 | |||
193 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
194 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
195 | |||
196 | 65534 | imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
197 | |||
198 | 65534 | realcacc = _mm_adds_epi16(realcacc, real); | |
199 | 65534 | imagcacc = _mm_adds_epi16(imagcacc, imag); | |
200 | |||
201 | 65534 | _in_a += 4; | |
202 | 65534 | _in_b += 4; | |
203 | } | ||
204 | |||
205 | 2 | realcacc = _mm_and_si128(realcacc, mask_real); | |
206 | 2 | imagcacc = _mm_and_si128(imagcacc, mask_imag); | |
207 | |||
208 | 2 | result = _mm_or_si128(realcacc, imagcacc); | |
209 | |||
210 | _mm_storeu_si128((__m128i*)dotProductVector, | ||
211 | result); // Store the results back into the dot product vector | ||
212 | |||
213 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 2 times.
|
10 | for (number = 0; number < 4; ++number) { |
214 | 8 | dotProduct = lv_cmake( | |
215 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
216 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
217 | } | ||
218 | } | ||
219 | |||
220 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2 times.
|
8 | for (number = 0; number < (num_points % 4); ++number) { |
221 | 6 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
222 | 6 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
223 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
224 | } | ||
225 | |||
226 | 2 | *_out = dotProduct; | |
227 | 2 | } | |
228 | #endif /* LV_HAVE_SSE2 */ | ||
229 | |||
230 | |||
231 | #ifdef LV_HAVE_AVX2 | ||
232 | #include <immintrin.h> | ||
233 | |||
234 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out, | |
235 | const lv_16sc_t* in_a, | ||
236 | const lv_16sc_t* in_b, | ||
237 | unsigned int num_points) | ||
238 | { | ||
239 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
240 | |||
241 | 2 | const unsigned int avx_iters = num_points / 8; | |
242 | |||
243 | 2 | const lv_16sc_t* _in_a = in_a; | |
244 | 2 | const lv_16sc_t* _in_b = in_b; | |
245 | 2 | lv_16sc_t* _out = out; | |
246 | unsigned int number; | ||
247 | |||
248 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (avx_iters > 0) { |
249 | __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
250 | realcacc, imagcacc, result; | ||
251 | __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; | ||
252 | |||
253 | 2 | realcacc = _mm256_setzero_si256(); | |
254 | 2 | imagcacc = _mm256_setzero_si256(); | |
255 | |||
256 | 2 | mask_imag = _mm256_set_epi8(0xFF, | |
257 | 0xFF, | ||
258 | 0, | ||
259 | 0, | ||
260 | 0xFF, | ||
261 | 0xFF, | ||
262 | 0, | ||
263 | 0, | ||
264 | 0xFF, | ||
265 | 0xFF, | ||
266 | 0, | ||
267 | 0, | ||
268 | 0xFF, | ||
269 | 0xFF, | ||
270 | 0, | ||
271 | 0, | ||
272 | 0xFF, | ||
273 | 0xFF, | ||
274 | 0, | ||
275 | 0, | ||
276 | 0xFF, | ||
277 | 0xFF, | ||
278 | 0, | ||
279 | 0, | ||
280 | 0xFF, | ||
281 | 0xFF, | ||
282 | 0, | ||
283 | 0, | ||
284 | 0xFF, | ||
285 | 0xFF, | ||
286 | 0, | ||
287 | 0); | ||
288 | 2 | mask_real = _mm256_set_epi8(0, | |
289 | 0, | ||
290 | 0xFF, | ||
291 | 0xFF, | ||
292 | 0, | ||
293 | 0, | ||
294 | 0xFF, | ||
295 | 0xFF, | ||
296 | 0, | ||
297 | 0, | ||
298 | 0xFF, | ||
299 | 0xFF, | ||
300 | 0, | ||
301 | 0, | ||
302 | 0xFF, | ||
303 | 0xFF, | ||
304 | 0, | ||
305 | 0, | ||
306 | 0xFF, | ||
307 | 0xFF, | ||
308 | 0, | ||
309 | 0, | ||
310 | 0xFF, | ||
311 | 0xFF, | ||
312 | 0, | ||
313 | 0, | ||
314 | 0xFF, | ||
315 | 0xFF, | ||
316 | 0, | ||
317 | 0, | ||
318 | 0xFF, | ||
319 | 0xFF); | ||
320 | |||
321 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < avx_iters; number++) { |
322 | 32766 | a = _mm256_loadu_si256((__m256i*)_in_a); | |
323 | 32766 | __VOLK_PREFETCH(_in_a + 16); | |
324 | 32766 | b = _mm256_loadu_si256((__m256i*)_in_b); | |
325 | 32766 | __VOLK_PREFETCH(_in_b + 16); | |
326 | 32766 | c = _mm256_mullo_epi16(a, b); | |
327 | |||
328 | 32766 | c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting | |
329 | // in zeros, and store the results in dst. | ||
330 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
331 | |||
332 | 32766 | b_sl = _mm256_slli_si256(b, 2); | |
333 | 32766 | a_sl = _mm256_slli_si256(a, 2); | |
334 | |||
335 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); | |
336 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); | |
337 | |||
338 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
339 | |||
340 | 32766 | realcacc = _mm256_adds_epi16(realcacc, real); | |
341 | 32766 | imagcacc = _mm256_adds_epi16(imagcacc, imag); | |
342 | |||
343 | 32766 | _in_a += 8; | |
344 | 32766 | _in_b += 8; | |
345 | } | ||
346 | |||
347 | 2 | realcacc = _mm256_and_si256(realcacc, mask_real); | |
348 | 2 | imagcacc = _mm256_and_si256(imagcacc, mask_imag); | |
349 | |||
350 | 2 | result = _mm256_or_si256(realcacc, imagcacc); | |
351 | |||
352 | _mm256_storeu_si256((__m256i*)dotProductVector, | ||
353 | result); // Store the results back into the dot product vector | ||
354 | |||
355 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; ++number) { |
356 | 16 | dotProduct = lv_cmake( | |
357 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
358 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
359 | } | ||
360 | } | ||
361 | |||
362 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = 0; number < (num_points % 8); ++number) { |
363 | 14 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
364 | 14 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
365 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
366 | } | ||
367 | |||
368 | 2 | *_out = dotProduct; | |
369 | 2 | } | |
370 | #endif /* LV_HAVE_AVX2 */ | ||
371 | |||
372 | |||
373 | #ifdef LV_HAVE_AVX2 | ||
374 | #include <immintrin.h> | ||
375 | |||
376 | 2 | static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out, | |
377 | const lv_16sc_t* in_a, | ||
378 | const lv_16sc_t* in_b, | ||
379 | unsigned int num_points) | ||
380 | { | ||
381 | 2 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | |
382 | |||
383 | 2 | const unsigned int avx_iters = num_points / 8; | |
384 | |||
385 | 2 | const lv_16sc_t* _in_a = in_a; | |
386 | 2 | const lv_16sc_t* _in_b = in_b; | |
387 | 2 | lv_16sc_t* _out = out; | |
388 | unsigned int number; | ||
389 | |||
390 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (avx_iters > 0) { |
391 | __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
392 | realcacc, imagcacc, result; | ||
393 | __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; | ||
394 | |||
395 | 2 | realcacc = _mm256_setzero_si256(); | |
396 | 2 | imagcacc = _mm256_setzero_si256(); | |
397 | |||
398 | 2 | mask_imag = _mm256_set_epi8(0xFF, | |
399 | 0xFF, | ||
400 | 0, | ||
401 | 0, | ||
402 | 0xFF, | ||
403 | 0xFF, | ||
404 | 0, | ||
405 | 0, | ||
406 | 0xFF, | ||
407 | 0xFF, | ||
408 | 0, | ||
409 | 0, | ||
410 | 0xFF, | ||
411 | 0xFF, | ||
412 | 0, | ||
413 | 0, | ||
414 | 0xFF, | ||
415 | 0xFF, | ||
416 | 0, | ||
417 | 0, | ||
418 | 0xFF, | ||
419 | 0xFF, | ||
420 | 0, | ||
421 | 0, | ||
422 | 0xFF, | ||
423 | 0xFF, | ||
424 | 0, | ||
425 | 0, | ||
426 | 0xFF, | ||
427 | 0xFF, | ||
428 | 0, | ||
429 | 0); | ||
430 | 2 | mask_real = _mm256_set_epi8(0, | |
431 | 0, | ||
432 | 0xFF, | ||
433 | 0xFF, | ||
434 | 0, | ||
435 | 0, | ||
436 | 0xFF, | ||
437 | 0xFF, | ||
438 | 0, | ||
439 | 0, | ||
440 | 0xFF, | ||
441 | 0xFF, | ||
442 | 0, | ||
443 | 0, | ||
444 | 0xFF, | ||
445 | 0xFF, | ||
446 | 0, | ||
447 | 0, | ||
448 | 0xFF, | ||
449 | 0xFF, | ||
450 | 0, | ||
451 | 0, | ||
452 | 0xFF, | ||
453 | 0xFF, | ||
454 | 0, | ||
455 | 0, | ||
456 | 0xFF, | ||
457 | 0xFF, | ||
458 | 0, | ||
459 | 0, | ||
460 | 0xFF, | ||
461 | 0xFF); | ||
462 | |||
463 |
2/2✓ Branch 0 taken 32766 times.
✓ Branch 1 taken 2 times.
|
32768 | for (number = 0; number < avx_iters; number++) { |
464 | 32766 | a = _mm256_load_si256((__m256i*)_in_a); | |
465 | 32766 | __VOLK_PREFETCH(_in_a + 16); | |
466 | 32766 | b = _mm256_load_si256((__m256i*)_in_b); | |
467 | 32766 | __VOLK_PREFETCH(_in_b + 16); | |
468 | 32766 | c = _mm256_mullo_epi16(a, b); | |
469 | |||
470 | 32766 | c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting | |
471 | // in zeros, and store the results in dst. | ||
472 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
473 | |||
474 | 32766 | b_sl = _mm256_slli_si256(b, 2); | |
475 | 32766 | a_sl = _mm256_slli_si256(a, 2); | |
476 | |||
477 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); | |
478 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); | |
479 | |||
480 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! | |
481 | |||
482 | 32766 | realcacc = _mm256_adds_epi16(realcacc, real); | |
483 | 32766 | imagcacc = _mm256_adds_epi16(imagcacc, imag); | |
484 | |||
485 | 32766 | _in_a += 8; | |
486 | 32766 | _in_b += 8; | |
487 | } | ||
488 | |||
489 | 2 | realcacc = _mm256_and_si256(realcacc, mask_real); | |
490 | 2 | imagcacc = _mm256_and_si256(imagcacc, mask_imag); | |
491 | |||
492 | 2 | result = _mm256_or_si256(realcacc, imagcacc); | |
493 | |||
494 | _mm256_store_si256((__m256i*)dotProductVector, | ||
495 | result); // Store the results back into the dot product vector | ||
496 | |||
497 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | for (number = 0; number < 8; ++number) { |
498 | 16 | dotProduct = lv_cmake( | |
499 | sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), | ||
500 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); | ||
501 | } | ||
502 | } | ||
503 | |||
504 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 2 times.
|
16 | for (number = 0; number < (num_points % 8); ++number) { |
505 | 14 | lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | |
506 | 14 | dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), | |
507 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); | ||
508 | } | ||
509 | |||
510 | 2 | *_out = dotProduct; | |
511 | 2 | } | |
512 | #endif /* LV_HAVE_AVX2 */ | ||
513 | |||
514 | |||
515 | #ifdef LV_HAVE_NEON | ||
516 | #include <arm_neon.h> | ||
517 | |||
518 | static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, | ||
519 | const lv_16sc_t* in_a, | ||
520 | const lv_16sc_t* in_b, | ||
521 | unsigned int num_points) | ||
522 | { | ||
523 | unsigned int quarter_points = num_points / 4; | ||
524 | unsigned int number; | ||
525 | |||
526 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
527 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
528 | *out = lv_cmake((int16_t)0, (int16_t)0); | ||
529 | |||
530 | if (quarter_points > 0) { | ||
531 | // for 2-lane vectors, 1st lane holds the real part, | ||
532 | // 2nd lane holds the imaginary part | ||
533 | int16x4x2_t a_val, b_val, c_val, accumulator; | ||
534 | int16x4x2_t tmp_real, tmp_imag; | ||
535 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
536 | accumulator.val[0] = vdup_n_s16(0); | ||
537 | accumulator.val[1] = vdup_n_s16(0); | ||
538 | lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | ||
539 | |||
540 | for (number = 0; number < quarter_points; ++number) { | ||
541 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
542 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
543 | __VOLK_PREFETCH(a_ptr + 8); | ||
544 | __VOLK_PREFETCH(b_ptr + 8); | ||
545 | |||
546 | // multiply the real*real and imag*imag to get real result | ||
547 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
548 | tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
549 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
550 | tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); | ||
551 | |||
552 | // Multiply cross terms to get the imaginary result | ||
553 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
554 | tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); | ||
555 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
556 | tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
557 | |||
558 | c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); | ||
559 | c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); | ||
560 | |||
561 | accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); | ||
562 | accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); | ||
563 | |||
564 | a_ptr += 4; | ||
565 | b_ptr += 4; | ||
566 | } | ||
567 | |||
568 | vst2_s16((int16_t*)accum_result, accumulator); | ||
569 | for (number = 0; number < 4; ++number) { | ||
570 | dotProduct = lv_cmake( | ||
571 | sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), | ||
572 | sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); | ||
573 | } | ||
574 | |||
575 | *out = dotProduct; | ||
576 | } | ||
577 | |||
578 | // tail case | ||
579 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
580 | *out += (*a_ptr++) * (*b_ptr++); | ||
581 | } | ||
582 | } | ||
583 | |||
584 | #endif /* LV_HAVE_NEON */ | ||
585 | |||
586 | |||
587 | #ifdef LV_HAVE_NEON | ||
588 | #include <arm_neon.h> | ||
589 | |||
590 | static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, | ||
591 | const lv_16sc_t* in_a, | ||
592 | const lv_16sc_t* in_b, | ||
593 | unsigned int num_points) | ||
594 | { | ||
595 | unsigned int quarter_points = num_points / 4; | ||
596 | unsigned int number; | ||
597 | |||
598 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
599 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
600 | // for 2-lane vectors, 1st lane holds the real part, | ||
601 | // 2nd lane holds the imaginary part | ||
602 | int16x4x2_t a_val, b_val, accumulator; | ||
603 | int16x4x2_t tmp; | ||
604 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
605 | accumulator.val[0] = vdup_n_s16(0); | ||
606 | accumulator.val[1] = vdup_n_s16(0); | ||
607 | |||
608 | for (number = 0; number < quarter_points; ++number) { | ||
609 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
610 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
611 | __VOLK_PREFETCH(a_ptr + 8); | ||
612 | __VOLK_PREFETCH(b_ptr + 8); | ||
613 | |||
614 | tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
615 | tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
616 | |||
617 | // use multiply accumulate/subtract to get result | ||
618 | tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); | ||
619 | tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); | ||
620 | |||
621 | accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); | ||
622 | accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); | ||
623 | |||
624 | a_ptr += 4; | ||
625 | b_ptr += 4; | ||
626 | } | ||
627 | |||
628 | vst2_s16((int16_t*)accum_result, accumulator); | ||
629 | *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; | ||
630 | |||
631 | // tail case | ||
632 | for (number = quarter_points * 4; number < num_points; ++number) { | ||
633 | *out += (*a_ptr++) * (*b_ptr++); | ||
634 | } | ||
635 | } | ||
636 | |||
637 | #endif /* LV_HAVE_NEON */ | ||
638 | |||
639 | |||
640 | #ifdef LV_HAVE_NEON | ||
641 | #include <arm_neon.h> | ||
642 | |||
643 | static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, | ||
644 | const lv_16sc_t* in_a, | ||
645 | const lv_16sc_t* in_b, | ||
646 | unsigned int num_points) | ||
647 | { | ||
648 | unsigned int quarter_points = num_points / 4; | ||
649 | unsigned int number; | ||
650 | |||
651 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
652 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
653 | // for 2-lane vectors, 1st lane holds the real part, | ||
654 | // 2nd lane holds the imaginary part | ||
655 | int16x4x2_t a_val, b_val, accumulator1, accumulator2; | ||
656 | |||
657 | __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; | ||
658 | accumulator1.val[0] = vdup_n_s16(0); | ||
659 | accumulator1.val[1] = vdup_n_s16(0); | ||
660 | accumulator2.val[0] = vdup_n_s16(0); | ||
661 | accumulator2.val[1] = vdup_n_s16(0); | ||
662 | |||
663 | for (number = 0; number < quarter_points; ++number) { | ||
664 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
665 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
666 | __VOLK_PREFETCH(a_ptr + 8); | ||
667 | __VOLK_PREFETCH(b_ptr + 8); | ||
668 | |||
669 | // use 2 accumulators to remove inter-instruction data dependencies | ||
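        // vmla/vmls wrap on overflow, so unlike the kernels above this variant
        // saturates only when the two partial accumulators are merged after the
        // loop; accumulator2.val[0] gathers -a.i*b.i via vmls, so that merge
        // forms the real part a.r*b.r - a.i*b.i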
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/