GCC Code Coverage Report

Directory:	./
File:	kernels/volk/volk_16i_permute_and_scalar_add.h
Date:	2023-10-23 23:10:04

	Total	Coverage
Lines:	69	0.0%
Functions:	2	0.0%
Branches:	6	0.0%

  
      Line
      Branch
      Exec
      Source
    
      /* -*- c++ -*- */
    
      /*
    
       * Copyright 2012, 2014 Free Software Foundation, Inc.
    
       *
    
       * This file is part of VOLK
    
       *
    
       * SPDX-License-Identifier: LGPL-3.0-or-later
    
       */
    
      /*!
    
       * \page volk_16i_permute_and_scalar_add
    
       *
    
       * \b Overview
    
       *
    
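       * Permutes src0 and adds four masked terms: target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3]).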
    
       *
    
       * <b>Dispatcher Prototype</b>
    
       * \code
    
       * void volk_16i_permute_and_scalar_add(short* target,  short* src0, short*
    
       * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
    
       * scalars, unsigned int num_points) \endcode
    
       *
    
       * \b Inputs
    
       * \li src0: The input vector.
    
       * \li permute_indexes: For each output point i, the index into src0 of the element to copy.
    
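       * \li cntl0: Control vector; cntl0[i] is bitwise ANDed with scalars[0] and the result is added to output i.
    
       * \li cntl1: Control vector; cntl1[i] is bitwise ANDed with scalars[1] and the result is added to output i.
    
       * \li cntl2: Control vector; cntl2[i] is bitwise ANDed with scalars[2] and the result is added to output i.
    
       * \li cntl3: Control vector; cntl3[i] is bitwise ANDed with scalars[3] and the result is added to output i.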
    
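       * \li scalars: The four values scalars[0..3] that are ANDed with cntl0..cntl3. The SSE2 kernel reads them with one aligned 128-bit load (8 shorts), so the buffer must be 16-byte aligned and at least 16 bytes long.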
    
       * \li num_points: The number of 16-bit data points to process.
    
       *
    
       * \b Outputs
    
       * \li target: The output value.
    
       *
    
       * \b Example
    
       * \code
    
       * int N = 10000;
    
       *
    
       * volk_16i_permute_and_scalar_add();
    
       *
    
       * volk_free(x);
    
       * \endcode
    
       */
    
      #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
    
      #define INCLUDED_volk_16i_permute_and_scalar_add_a_H
    
      #include <inttypes.h>
    
      #include <stdio.h>
    
      #ifdef LV_HAVE_SSE2
    
      #include <emmintrin.h>
    
      #include <xmmintrin.h>
    
      ✗
      static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
    
                                                                short* src0,
    
                                                                short* permute_indexes,
    
                                                                short* cntl0,
    
                                                                short* cntl1,
    
                                                                short* cntl2,
    
                                                                short* cntl3,
    
                                                                short* scalars,
    
                                                                unsigned int num_points)
    
      {
    
          /*
           * SSE2 implementation. For every i in [0, num_points) it stores
           *   target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
           *               (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
           *               (cntl3[i] & scalars[3])
           * Full groups of eight points are handled with 128-bit vector code;
           * the remaining (num_points & 7) points use the scalar loop at the end.
           *
           * target, cntl0..cntl3 and scalars are accessed with
           * _mm_load_si128 / _mm_store_si128, which are the aligned-access
           * intrinsics, so those pointers must be 16-byte aligned.
           * src0 and permute_indexes are only read element by element.
           */
      ✗
          const unsigned int num_bytes = num_points * 2;
    
          __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    
          __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
    
          /* Cursor into permute_indexes; advanced by 8 per vector iteration. */
      ✗
          short* p_permute_indexes = permute_indexes;
    
          /* View the 16-bit arrays as 128-bit vectors (eight shorts each). */
      ✗
          p_target = (__m128i*)target;
    
      ✗
          p_cntl0 = (__m128i*)cntl0;
    
      ✗
          p_cntl1 = (__m128i*)cntl1;
    
      ✗
          p_cntl2 = (__m128i*)cntl2;
    
      ✗
          p_cntl3 = (__m128i*)cntl3;
    
      ✗
          p_scalars = (__m128i*)scalars;
    
      ✗
          int i = 0;
    
          /* Number of full 16-byte (eight-point) groups: num_bytes / 16. */
      ✗
          int bound = (num_bytes >> 4);
    
          /* Points left over after the full groups: low three bits of the count. */
      ✗
          int leftovers = (num_bytes >> 1) & 7;
    
          /* Reads eight shorts from scalars; only the low four are used below. */
      ✗
          xmm0 = _mm_load_si128(p_scalars);
    
          /* Replicate scalars[k] across the low four 16-bit lanes, k = 0..3. */
      ✗
          xmm1 = _mm_shufflelo_epi16(xmm0, 0);
    
      ✗
          xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
    
      ✗
          xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
    
      ✗
          xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
    
          /* Broadcast the low 32 bits so all eight lanes of xmm1..xmm4 hold */
          /* scalars[0]..scalars[3] respectively. */
      ✗
          xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
    
      ✗
          xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
    
      ✗
          xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
    
      ✗
          xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
    
          /* Vector loop: one iteration produces eight outputs. */
      ✗
          for (; i < bound; ++i) {
    
              /* Gather: start from four zeroed registers ... */
      ✗
              xmm0 = _mm_setzero_si128();
    
      ✗
              xmm5 = _mm_setzero_si128();
    
      ✗
              xmm6 = _mm_setzero_si128();
    
      ✗
              xmm7 = _mm_setzero_si128();
    
              /* ... and place permuted element k into lane k of one of them. */
              /* Each lane is non-zero in at most one register, so summing the */
              /* four registers below merges them into the gathered vector. */
      ✗
              xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
    
      ✗
              xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
    
      ✗
              xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
    
      ✗
              xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
    
      ✗
              xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
    
      ✗
              xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
    
      ✗
              xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
    
      ✗
              xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
    
      ✗
              xmm0 = _mm_add_epi16(xmm0, xmm5);
    
      ✗
              xmm6 = _mm_add_epi16(xmm6, xmm7);
    
      ✗
              p_permute_indexes += 8;
    
              /* xmm0 now holds the eight permuted src0 values. */
      ✗
              xmm0 = _mm_add_epi16(xmm0, xmm6);
    
              /* Load the control vectors and mask each with its scalar. */
      ✗
              xmm5 = _mm_load_si128(p_cntl0);
    
      ✗
              xmm6 = _mm_load_si128(p_cntl1);
    
      ✗
              xmm7 = _mm_load_si128(p_cntl2);
    
      ✗
              xmm5 = _mm_and_si128(xmm5, xmm1);
    
      ✗
              xmm6 = _mm_and_si128(xmm6, xmm2);
    
      ✗
              xmm7 = _mm_and_si128(xmm7, xmm3);
    
              /* Accumulate the cntl0 term. */
      ✗
              xmm0 = _mm_add_epi16(xmm0, xmm5);
    
              /* xmm5 is free again; reuse it for cntl3. */
      ✗
              xmm5 = _mm_load_si128(p_cntl3);
    
              /* Combine the cntl1 and cntl2 terms. */
      ✗
              xmm6 = _mm_add_epi16(xmm6, xmm7);
    
      ✗
              p_cntl0 += 1;
    
      ✗
              xmm5 = _mm_and_si128(xmm5, xmm4);
    
              /* Accumulate the cntl1 + cntl2 terms. */
      ✗
              xmm0 = _mm_add_epi16(xmm0, xmm6);
    
      ✗
              p_cntl1 += 1;
    
      ✗
              p_cntl2 += 1;
    
              /* Accumulate the cntl3 term; xmm0 is the final result vector. */
      ✗
              xmm0 = _mm_add_epi16(xmm0, xmm5);
    
      ✗
              p_cntl3 += 1;
    
              /* Aligned store of eight results. */
              _mm_store_si128(p_target, xmm0);
    
      ✗
              p_target += 1;
    
          }
    
          /* Scalar tail: same formula for the points after the last full group. */
      ✗
          for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
    
      ✗
              target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
    
      ✗
                          (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
    
      ✗
                          (cntl3[i] & scalars[3]);
    
          }
    
      ✗
      }
    
      #endif /*LV_HAVE_SSE2*/
    
      #ifdef LV_HAVE_GENERIC
    
      ✗
      static inline void volk_16i_permute_and_scalar_add_generic(short* target,
    
                                                                 short* src0,
    
                                                                 short* permute_indexes,
    
                                                                 short* cntl0,
    
                                                                 short* cntl1,
    
                                                                 short* cntl2,
    
                                                                 short* cntl3,
    
                                                                 short* scalars,
    
                                                                 unsigned int num_points)
    
      {
    
          /*
           * Portable scalar implementation. For every i in [0, bound) it stores
           *   target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
           *               (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
           *               (cntl3[i] & scalars[3])
           * Only scalars[0]..scalars[3] are read. The sum is evaluated in int
           * (integer promotion) and converted back to short when stored.
           */
      ✗
          const unsigned int num_bytes = num_points * 2;
    
      ✗
          int i = 0;
    
          /* Halving the byte count gives back the number of 16-bit points. */
      ✗
          int bound = num_bytes >> 1;
    
      ✗
          for (i = 0; i < bound; ++i) {
    
      ✗
              target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
    
      ✗
                          (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
    
      ✗
                          (cntl3[i] & scalars[3]);
    
          }
    
      ✗
      }
    
      #endif /*LV_HAVE_GENERIC*/
    
      #endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/

Line	Exec	Source
1		/* -*- c++ -*- */
2		/*
3		* Copyright 2012, 2014 Free Software Foundation, Inc.
4		*
5		* This file is part of VOLK
6		*
7		* SPDX-License-Identifier: LGPL-3.0-or-later
8		*/
9
10		/*!
11		* \page volk_16i_permute_and_scalar_add
12		*
13		* \b Overview
14		*
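15		* Permutes src0 and adds four masked terms: target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3]).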
16		*
17		* <b>Dispatcher Prototype</b>
18		* \code
19		* void volk_16i_permute_and_scalar_add(short* target, short* src0, short*
20		* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
21		* scalars, unsigned int num_points) \endcode
22		*
23		* \b Inputs
24		* \li src0: The input vector.
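25		* \li permute_indexes: For each output point i, the index into src0 of the element to copy.
26		* \li cntl0: Control vector; cntl0[i] is bitwise ANDed with scalars[0] and the result is added to output i.
27		* \li cntl1: Control vector; cntl1[i] is bitwise ANDed with scalars[1] and the result is added to output i.
28		* \li cntl2: Control vector; cntl2[i] is bitwise ANDed with scalars[2] and the result is added to output i.
29		* \li cntl3: Control vector; cntl3[i] is bitwise ANDed with scalars[3] and the result is added to output i.
30		* \li scalars: The four values scalars[0..3] that are ANDed with cntl0..cntl3. The SSE2 kernel reads them with one aligned 128-bit load (8 shorts), so the buffer must be 16-byte aligned and at least 16 bytes long.
31		* \li num_points: The number of 16-bit data points to process.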
32		*
33		* \b Outputs
34		* \li target: The output value.
35		*
36		* \b Example
37		* \code
38		* int N = 10000;
39		*
40		* volk_16i_permute_and_scalar_add();
41		*
42		* volk_free(x);
43		* \endcode
44		*/
45
46		#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
47		#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
48
49		#include <inttypes.h>
50		#include <stdio.h>
51
52		#ifdef LV_HAVE_SSE2
53
54		#include <emmintrin.h>
55		#include <xmmintrin.h>
56
57	✗	static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
58		short* src0,
59		short* permute_indexes,
60		short* cntl0,
61		short* cntl1,
62		short* cntl2,
63		short* cntl3,
64		short* scalars,
65		unsigned int num_points)
66		{
67
68	✗	const unsigned int num_bytes = num_points * 2;
69
70		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
71
72		__m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
73
74	✗	short* p_permute_indexes = permute_indexes;
75
76	✗	p_target = (__m128i*)target;
77	✗	p_cntl0 = (__m128i*)cntl0;
78	✗	p_cntl1 = (__m128i*)cntl1;
79	✗	p_cntl2 = (__m128i*)cntl2;
80	✗	p_cntl3 = (__m128i*)cntl3;
81	✗	p_scalars = (__m128i*)scalars;
82
83	✗	int i = 0;
84
85	✗	int bound = (num_bytes >> 4);
86	✗	int leftovers = (num_bytes >> 1) & 7;
87
88	✗	xmm0 = _mm_load_si128(p_scalars);
89
90	✗	xmm1 = _mm_shufflelo_epi16(xmm0, 0);
91	✗	xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
92	✗	xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
93	✗	xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
94
95	✗	xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
96	✗	xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
97	✗	xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
98	✗	xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
99
100
101	✗	for (; i < bound; ++i) {
102	✗	xmm0 = _mm_setzero_si128();
103	✗	xmm5 = _mm_setzero_si128();
104	✗	xmm6 = _mm_setzero_si128();
105	✗	xmm7 = _mm_setzero_si128();
106
107	✗	xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
108	✗	xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
109	✗	xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
110	✗	xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
111	✗	xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
112	✗	xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
113	✗	xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
114	✗	xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
115
116	✗	xmm0 = _mm_add_epi16(xmm0, xmm5);
117	✗	xmm6 = _mm_add_epi16(xmm6, xmm7);
118
119	✗	p_permute_indexes += 8;
120
121	✗	xmm0 = _mm_add_epi16(xmm0, xmm6);
122
123	✗	xmm5 = _mm_load_si128(p_cntl0);
124	✗	xmm6 = _mm_load_si128(p_cntl1);
125	✗	xmm7 = _mm_load_si128(p_cntl2);
126
127	✗	xmm5 = _mm_and_si128(xmm5, xmm1);
128	✗	xmm6 = _mm_and_si128(xmm6, xmm2);
129	✗	xmm7 = _mm_and_si128(xmm7, xmm3);
130
131	✗	xmm0 = _mm_add_epi16(xmm0, xmm5);
132
133	✗	xmm5 = _mm_load_si128(p_cntl3);
134
135	✗	xmm6 = _mm_add_epi16(xmm6, xmm7);
136
137	✗	p_cntl0 += 1;
138
139	✗	xmm5 = _mm_and_si128(xmm5, xmm4);
140
141	✗	xmm0 = _mm_add_epi16(xmm0, xmm6);
142
143	✗	p_cntl1 += 1;
144	✗	p_cntl2 += 1;
145
146	✗	xmm0 = _mm_add_epi16(xmm0, xmm5);
147
148	✗	p_cntl3 += 1;
149
150		_mm_store_si128(p_target, xmm0);
151
152	✗	p_target += 1;
153		}
154
155	✗	for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
156	✗	target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
157	✗	(cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
158	✗	(cntl3[i] & scalars[3]);
159		}
160	✗	}
161		#endif /*LV_HAVE_SSE2*/
162
163
164		#ifdef LV_HAVE_GENERIC
165	✗	static inline void volk_16i_permute_and_scalar_add_generic(short* target,
166		short* src0,
167		short* permute_indexes,
168		short* cntl0,
169		short* cntl1,
170		short* cntl2,
171		short* cntl3,
172		short* scalars,
173		unsigned int num_points)
174		{
175	✗	const unsigned int num_bytes = num_points * 2;
176
177	✗	int i = 0;
178
179	✗	int bound = num_bytes >> 1;
180
181	✗	for (i = 0; i < bound; ++i) {
182	✗	target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
183	✗	(cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
184	✗	(cntl3[i] & scalars[3]);
185		}
186	✗	}
187
188		#endif /*LV_HAVE_GENERIC*/
189
190		#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
191