Line | Branch | Exec | Source |
---|---|---|---|
1 | /* -*- c++ -*- */ | ||
2 | /* | ||
3 | * Copyright 2016 Free Software Foundation, Inc. | ||
4 | * | ||
5 | * This file is part of VOLK | ||
6 | * | ||
7 | * SPDX-License-Identifier: LGPL-3.0-or-later | ||
8 | */ | ||
9 | |||
10 | /*! | ||
11 | * \page volk_16ic_x2_multiply_16ic | ||
12 | * | ||
13 | * \b Overview | ||
14 | * | ||
15 | * Multiplies two input complex vectors, point-by-point, storing the result in the | ||
16 | * output vector. WARNING: Saturation is not checked. | ||
17 | * | ||
18 | * <b>Dispatcher Prototype</b> | ||
19 | * \code | ||
20 | * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, | ||
21 | *                                 const lv_16sc_t* in_b, unsigned int num_points); \endcode | ||
22 | * | ||
23 | * \b Inputs | ||
24 | * \li in_a: One of the vectors to be multiplied. | ||
25 | * \li in_b: The other vector to be multiplied. | ||
26 | * \li num_points: The number of complex data points to be multiplied from both input | ||
27 | * vectors. | ||
28 | * | ||
29 | * \b Outputs | ||
30 | * \li result: The vector where the results will be stored. | ||
31 | * | ||
32 | */ | ||
33 | |||
34 | #ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H | ||
35 | #define INCLUDED_volk_16ic_x2_multiply_16ic_H | ||
36 | |||
37 | #include <volk/volk_common.h> | ||
38 | #include <volk/volk_complex.h> | ||
39 | |||
40 | #ifdef LV_HAVE_GENERIC | ||
41 | |||
42 | 2 | static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, | |
43 | const lv_16sc_t* in_a, | ||
44 | const lv_16sc_t* in_b, | ||
45 | unsigned int num_points) | ||
46 | { | ||
47 | unsigned int n; | ||
48 | 2/2 (✓ Branch 0 taken 262142, ✓ Branch 1 taken 2) | 262144 | for (n = 0; n < num_points; n++) {
49 | 262142 | result[n] = in_a[n] * in_b[n]; | |
50 | } | ||
51 | 2 | } | |
52 | |||
53 | #endif /*LV_HAVE_GENERIC*/ | ||
54 | |||
55 | |||
56 | #ifdef LV_HAVE_SSE2 | ||
57 | #include <emmintrin.h> | ||
58 | |||
59 | 2 | static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, | |
60 | const lv_16sc_t* in_a, | ||
61 | const lv_16sc_t* in_b, | ||
62 | unsigned int num_points) | ||
63 | { | ||
64 | 2 | const unsigned int sse_iters = num_points / 4; | |
65 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
66 | result; | ||
67 | |||
68 | 2 | mask_imag = _mm_set_epi8( | |
69 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
70 | 2 | mask_real = _mm_set_epi8( | |
71 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
72 | |||
73 | 2 | const lv_16sc_t* _in_a = in_a; | |
74 | 2 | const lv_16sc_t* _in_b = in_b; | |
75 | 2 | lv_16sc_t* _out = out; | |
76 | unsigned int number; | ||
77 | |||
78 | 2/2 (✓ Branch 0 taken 65534, ✓ Branch 1 taken 2) | 65536 | for (number = 0; number < sse_iters; number++) {
79 | 65534 | a = _mm_load_si128( | |
80 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
81 | 65534 | b = _mm_load_si128((__m128i*)_in_b); | |
82 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
83 | |||
84 | 65534 | c_sr = _mm_srli_si128(c, 2); // shift right 2 bytes so each imag*imag | |
85 | // product lines up under its real*real product | ||
86 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
87 | 65534 | real = _mm_and_si128(real, | |
88 | mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
89 | |||
90 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
91 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
92 | |||
93 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
94 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
95 | |||
96 | 65534 | imag = _mm_adds_epi16(imag1, imag2); | |
97 | 65534 | imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
98 | |||
99 | 65534 | result = _mm_or_si128(real, imag); | |
100 | |||
101 | _mm_store_si128((__m128i*)_out, result); | ||
102 | |||
103 | 65534 | _in_a += 4; | |
104 | 65534 | _in_b += 4; | |
105 | 65534 | _out += 4; | |
106 | } | ||
107 | |||
108 | 2/2 (✓ Branch 0 taken 6, ✓ Branch 1 taken 2) | 8 | for (number = sse_iters * 4; number < num_points; ++number) {
109 | 6 | *_out++ = (*_in_a++) * (*_in_b++); | |
110 | } | ||
111 | 2 | } | |
112 | #endif /* LV_HAVE_SSE2 */ | ||
113 | |||
114 | |||
115 | #ifdef LV_HAVE_SSE2 | ||
116 | #include <emmintrin.h> | ||
117 | |||
118 | 2 | static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, | |
119 | const lv_16sc_t* in_a, | ||
120 | const lv_16sc_t* in_b, | ||
121 | unsigned int num_points) | ||
122 | { | ||
123 | 2 | const unsigned int sse_iters = num_points / 4; | |
124 | __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, | ||
125 | result; | ||
126 | |||
127 | 2 | mask_imag = _mm_set_epi8( | |
128 | 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); | ||
129 | 2 | mask_real = _mm_set_epi8( | |
130 | 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); | ||
131 | |||
132 | 2 | const lv_16sc_t* _in_a = in_a; | |
133 | 2 | const lv_16sc_t* _in_b = in_b; | |
134 | 2 | lv_16sc_t* _out = out; | |
135 | unsigned int number; | ||
136 | |||
137 | 2/2 (✓ Branch 0 taken 65534, ✓ Branch 1 taken 2) | 65536 | for (number = 0; number < sse_iters; number++) {
138 | 65534 | a = _mm_loadu_si128( | |
139 | (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||
140 | 65534 | b = _mm_loadu_si128((__m128i*)_in_b); | |
141 | 65534 | c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... | |
142 | |||
143 | 65534 | c_sr = _mm_srli_si128(c, 2); // shift right 2 bytes so each imag*imag | |
144 | // product lines up under its real*real product | ||
145 | 65534 | real = _mm_subs_epi16(c, c_sr); | |
146 | 65534 | real = _mm_and_si128(real, | |
147 | mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
148 | |||
149 | 65534 | b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | |
150 | 65534 | a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | |
151 | |||
152 | 65534 | imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
153 | 65534 | imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
154 | |||
155 | 65534 | imag = _mm_adds_epi16(imag1, imag2); | |
156 | 65534 | imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
157 | |||
158 | 65534 | result = _mm_or_si128(real, imag); | |
159 | |||
160 | _mm_storeu_si128((__m128i*)_out, result); | ||
161 | |||
162 | 65534 | _in_a += 4; | |
163 | 65534 | _in_b += 4; | |
164 | 65534 | _out += 4; | |
165 | } | ||
166 | |||
167 | 2/2 (✓ Branch 0 taken 6, ✓ Branch 1 taken 2) | 8 | for (number = sse_iters * 4; number < num_points; ++number) {
168 | 6 | *_out++ = (*_in_a++) * (*_in_b++); | |
169 | } | ||
170 | 2 | } | |
171 | #endif /* LV_HAVE_SSE2 */ | ||
172 | |||
173 | |||
174 | #ifdef LV_HAVE_AVX2 | ||
175 | #include <immintrin.h> | ||
176 | |||
177 | 2 | static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, | |
178 | const lv_16sc_t* in_a, | ||
179 | const lv_16sc_t* in_b, | ||
180 | unsigned int num_points) | ||
181 | { | ||
182 | 2 | unsigned int number = 0; | |
183 | 2 | const unsigned int avx2_points = num_points / 8; | |
184 | |||
185 | 2 | const lv_16sc_t* _in_a = in_a; | |
186 | 2 | const lv_16sc_t* _in_b = in_b; | |
187 | 2 | lv_16sc_t* _out = out; | |
188 | |||
189 | __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; | ||
190 | |||
191 | 2 | const __m256i mask_imag = _mm256_set_epi8(0xFF, | |
192 | 0xFF, | ||
193 | 0, | ||
194 | 0, | ||
195 | 0xFF, | ||
196 | 0xFF, | ||
197 | 0, | ||
198 | 0, | ||
199 | 0xFF, | ||
200 | 0xFF, | ||
201 | 0, | ||
202 | 0, | ||
203 | 0xFF, | ||
204 | 0xFF, | ||
205 | 0, | ||
206 | 0, | ||
207 | 0xFF, | ||
208 | 0xFF, | ||
209 | 0, | ||
210 | 0, | ||
211 | 0xFF, | ||
212 | 0xFF, | ||
213 | 0, | ||
214 | 0, | ||
215 | 0xFF, | ||
216 | 0xFF, | ||
217 | 0, | ||
218 | 0, | ||
219 | 0xFF, | ||
220 | 0xFF, | ||
221 | 0, | ||
222 | 0); | ||
223 | 2 | const __m256i mask_real = _mm256_set_epi8(0, | |
224 | 0, | ||
225 | 0xFF, | ||
226 | 0xFF, | ||
227 | 0, | ||
228 | 0, | ||
229 | 0xFF, | ||
230 | 0xFF, | ||
231 | 0, | ||
232 | 0, | ||
233 | 0xFF, | ||
234 | 0xFF, | ||
235 | 0, | ||
236 | 0, | ||
237 | 0xFF, | ||
238 | 0xFF, | ||
239 | 0, | ||
240 | 0, | ||
241 | 0xFF, | ||
242 | 0xFF, | ||
243 | 0, | ||
244 | 0, | ||
245 | 0xFF, | ||
246 | 0xFF, | ||
247 | 0, | ||
248 | 0, | ||
249 | 0xFF, | ||
250 | 0xFF, | ||
251 | 0, | ||
252 | 0, | ||
253 | 0xFF, | ||
254 | 0xFF); | ||
255 | |||
256 | 2/2 (✓ Branch 0 taken 32766, ✓ Branch 1 taken 2) | 32768 | for (; number < avx2_points; number++) {
257 | 32766 | a = _mm256_loadu_si256( | |
258 | (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi | ||
259 | 32766 | b = _mm256_loadu_si256( | |
260 | (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di | ||
261 | 32766 | c = _mm256_mullo_epi16(a, b); | |
262 | |||
263 | 32766 | c_sr = _mm256_srli_si256(c, 2); // shift right 2 bytes within each 128-bit lane | |
264 | // so each imag*imag product lines up under its real*real product | ||
265 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
266 | 32766 | real = _mm256_and_si256( | |
267 | real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
268 | |||
269 | 32766 | b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... | |
270 | 32766 | a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... | |
271 | |||
272 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
273 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
274 | |||
275 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); | |
276 | 32766 | imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
277 | |||
278 | 32766 | result = _mm256_or_si256(real, imag); | |
279 | |||
280 | _mm256_storeu_si256((__m256i*)_out, result); | ||
281 | |||
282 | 32766 | _in_a += 8; | |
283 | 32766 | _in_b += 8; | |
284 | 32766 | _out += 8; | |
285 | } | ||
286 | |||
287 | 2 | number = avx2_points * 8; | |
288 | 2/2 (✓ Branch 0 taken 14, ✓ Branch 1 taken 2) | 16 | for (; number < num_points; number++) {
289 | 14 | *_out++ = (*_in_a++) * (*_in_b++); | |
290 | } | ||
291 | 2 | } | |
292 | #endif /* LV_HAVE_AVX2 */ | ||
293 | |||
294 | |||
295 | #ifdef LV_HAVE_AVX2 | ||
296 | #include <immintrin.h> | ||
297 | |||
298 | 2 | static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, | |
299 | const lv_16sc_t* in_a, | ||
300 | const lv_16sc_t* in_b, | ||
301 | unsigned int num_points) | ||
302 | { | ||
303 | 2 | unsigned int number = 0; | |
304 | 2 | const unsigned int avx2_points = num_points / 8; | |
305 | |||
306 | 2 | const lv_16sc_t* _in_a = in_a; | |
307 | 2 | const lv_16sc_t* _in_b = in_b; | |
308 | 2 | lv_16sc_t* _out = out; | |
309 | |||
310 | __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; | ||
311 | |||
312 | 2 | const __m256i mask_imag = _mm256_set_epi8(0xFF, | |
313 | 0xFF, | ||
314 | 0, | ||
315 | 0, | ||
316 | 0xFF, | ||
317 | 0xFF, | ||
318 | 0, | ||
319 | 0, | ||
320 | 0xFF, | ||
321 | 0xFF, | ||
322 | 0, | ||
323 | 0, | ||
324 | 0xFF, | ||
325 | 0xFF, | ||
326 | 0, | ||
327 | 0, | ||
328 | 0xFF, | ||
329 | 0xFF, | ||
330 | 0, | ||
331 | 0, | ||
332 | 0xFF, | ||
333 | 0xFF, | ||
334 | 0, | ||
335 | 0, | ||
336 | 0xFF, | ||
337 | 0xFF, | ||
338 | 0, | ||
339 | 0, | ||
340 | 0xFF, | ||
341 | 0xFF, | ||
342 | 0, | ||
343 | 0); | ||
344 | 2 | const __m256i mask_real = _mm256_set_epi8(0, | |
345 | 0, | ||
346 | 0xFF, | ||
347 | 0xFF, | ||
348 | 0, | ||
349 | 0, | ||
350 | 0xFF, | ||
351 | 0xFF, | ||
352 | 0, | ||
353 | 0, | ||
354 | 0xFF, | ||
355 | 0xFF, | ||
356 | 0, | ||
357 | 0, | ||
358 | 0xFF, | ||
359 | 0xFF, | ||
360 | 0, | ||
361 | 0, | ||
362 | 0xFF, | ||
363 | 0xFF, | ||
364 | 0, | ||
365 | 0, | ||
366 | 0xFF, | ||
367 | 0xFF, | ||
368 | 0, | ||
369 | 0, | ||
370 | 0xFF, | ||
371 | 0xFF, | ||
372 | 0, | ||
373 | 0, | ||
374 | 0xFF, | ||
375 | 0xFF); | ||
376 | |||
377 | 2/2 (✓ Branch 0 taken 32766, ✓ Branch 1 taken 2) | 32768 | for (; number < avx2_points; number++) {
378 | 32766 | a = _mm256_load_si256( | |
379 | (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi | ||
380 | 32766 | b = _mm256_load_si256( | |
381 | (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di | ||
382 | 32766 | c = _mm256_mullo_epi16(a, b); | |
383 | |||
384 | 32766 | c_sr = _mm256_srli_si256(c, 2); // shift right 2 bytes within each 128-bit lane | |
385 | // so each imag*imag product lines up under its real*real product | ||
386 | 32766 | real = _mm256_subs_epi16(c, c_sr); | |
387 | 32766 | real = _mm256_and_si256( | |
388 | real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i | ||
389 | |||
390 | 32766 | b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... | |
391 | 32766 | a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... | |
392 | |||
393 | 32766 | imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | |
394 | 32766 | imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | |
395 | |||
396 | 32766 | imag = _mm256_adds_epi16(imag1, imag2); | |
397 | 32766 | imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... | |
398 | |||
399 | 32766 | result = _mm256_or_si256(real, imag); | |
400 | |||
401 | _mm256_store_si256((__m256i*)_out, result); | ||
402 | |||
403 | 32766 | _in_a += 8; | |
404 | 32766 | _in_b += 8; | |
405 | 32766 | _out += 8; | |
406 | } | ||
407 | |||
408 | 2 | number = avx2_points * 8; | |
409 | 2/2 (✓ Branch 0 taken 14, ✓ Branch 1 taken 2) | 16 | for (; number < num_points; number++) {
410 | 14 | *_out++ = (*_in_a++) * (*_in_b++); | |
411 | } | ||
412 | 2 | } | |
413 | #endif /* LV_HAVE_AVX2 */ | ||
414 | |||
415 | |||
416 | #ifdef LV_HAVE_NEON | ||
417 | #include <arm_neon.h> | ||
418 | |||
419 | static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, | ||
420 | const lv_16sc_t* in_a, | ||
421 | const lv_16sc_t* in_b, | ||
422 | unsigned int num_points) | ||
423 | { | ||
424 | lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; | ||
425 | lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; | ||
426 | unsigned int quarter_points = num_points / 4; | ||
427 | int16x4x2_t a_val, b_val, c_val; | ||
428 | int16x4x2_t tmp_real, tmp_imag; | ||
429 | unsigned int number = 0; | ||
430 | |||
431 | for (number = 0; number < quarter_points; ++number) { | ||
432 | a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i | ||
433 | b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i | ||
434 | __VOLK_PREFETCH(a_ptr + 4); | ||
435 | __VOLK_PREFETCH(b_ptr + 4); | ||
436 | |||
437 | // multiply the real*real and imag*imag to get real result | ||
438 | // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||
439 | tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||
440 | // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||
441 | tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); | ||
442 | |||
443 | // Multiply cross terms to get the imaginary result | ||
444 | // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||
445 | tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); | ||
446 | // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||
447 | tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||
448 | |||
449 | // store the results | ||
450 | c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); | ||
451 | c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); | ||
452 | vst2_s16((int16_t*)out, c_val); | ||
453 | |||
454 | a_ptr += 4; | ||
455 | b_ptr += 4; | ||
456 | out += 4; | ||
457 | } | ||
458 | |||
459 | for (number = quarter_points * 4; number < num_points; number++) { | ||
460 | *out++ = (*a_ptr++) * (*b_ptr++); | ||
461 | } | ||
462 | } | ||
463 | #endif /* LV_HAVE_NEON */ | ||
464 | |||
465 | #endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/ | ||
466 |
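
For readers tracing the SSE2/AVX2 lane arithmetic in the listing above, the per-sample operation is the ordinary complex product: re = ar*br - ai*bi and im = ar*bi + ai*br. The sketch below is not part of the covered file; the helper name `complex_mul_16ic_scalar` is illustrative, and it assumes only the `lv_cmake`/`lv_creal`/`lv_cimag` helpers from `<volk/volk_complex.h>`. Note that the SIMD kernels form the products with `_mm_mullo_epi16` (low 16 bits only) and combine them with saturating adds/subtracts, so overflow behaviour can differ from a plain scalar multiply.

```c
#include <stdint.h>
#include <volk/volk_complex.h>

/* Illustrative scalar view of one output sample of volk_16ic_x2_multiply_16ic.
 * Overflow is simply truncated here; the SIMD kernels above truncate the
 * individual products but saturate the final add/subtract, and the header
 * explicitly warns that saturation is not checked. */
static inline lv_16sc_t complex_mul_16ic_scalar(lv_16sc_t a, lv_16sc_t b)
{
    const int16_t ar = (int16_t)lv_creal(a), ai = (int16_t)lv_cimag(a);
    const int16_t br = (int16_t)lv_creal(b), bi = (int16_t)lv_cimag(b);
    const int16_t re = (int16_t)(ar * br - ai * bi); /* real part */
    const int16_t im = (int16_t)(ar * bi + ai * br); /* imaginary part */
    return lv_cmake(re, im);
}
```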
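
And a minimal usage sketch for the dispatcher documented at the top of the header. It is likewise not part of the covered file; it assumes the standard VOLK allocation helpers (`volk_get_alignment`, `volk_malloc`, `volk_free`) from `<volk/volk.h>`, and the sample values are purely illustrative.

```c
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 4;
    const size_t alignment = volk_get_alignment();
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* result = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

    for (unsigned int n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)(n + 1), (int16_t)2); /* (1+2j), (2+2j), ... */
        in_b[n] = lv_cmake((int16_t)3, (int16_t)-1);      /* (3-1j) for every sample */
    }

    /* The dispatcher selects the fastest available kernel (generic, SSE2, AVX2, ...). */
    volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);

    for (unsigned int n = 0; n < num_points; n++) {
        /* e.g. (1+2j)*(3-1j) = 5+5j */
        printf("%d%+dj\n", (int)lv_creal(result[n]), (int)lv_cimag(result[n]));
    }

    volk_free(in_a);
    volk_free(in_b);
    volk_free(result);
    return 0;
}
```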