/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX2 intrinsics of intrinsics.
 * They should be used in VOLK kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk/volk_avx_intrinsics.h"
#include <immintrin.h>

/*
 * Build a sign-flip mask from eight packed decision bytes: float lane i of
 * the result has its sign bit set iff byte i of fbits is positive. XORing a
 * float vector with this mask conditionally negates its elements.
 */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    /* Within each 128-bit half, every group of four selector bytes routes one
     * decision byte into the most significant (sign) byte of a 32-bit lane;
     * 0xff selects zero. */
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}
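
/*
 * Usage sketch (illustrative only; the function name is hypothetical, not
 * VOLK API): flip the sign of each of the eight LLRs whose corresponding
 * decision byte in fbits is positive.
 */
static inline __m256 example_polar_apply_sign_avx2(__m256 llrs, __m128i fbits)
{
    const __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
    return _mm256_xor_ps(llrs, sign_mask);
}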

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
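
/*
 * Scalar reference for the vectorized sign-and-add above (illustrative
 * sketch; the function name is hypothetical, not VOLK API): per element,
 * llr0 is negated when the decision byte is positive, then llr1 is added.
 */
static inline float example_polar_fsign_add_llr_scalar(float llr0, float llr1, signed char fbit)
{
    return (fbit > 0 ? -llr0 : llr0) + llr1;
}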

static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    // hadd_ps interleaves the 128-bit lanes; the permute restores input order
    return _mm256_permutevar8x32_ps(complex_result, idx);
}
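
/*
 * Scalar reference (illustrative sketch; the function name is hypothetical,
 * not VOLK API): the intrinsic above yields re * re + im * im for eight
 * complex floats stored as interleaved re/im pairs across the two inputs.
 */
static inline float example_magnitude_squared_scalar(float re, float im)
{
    return re * re + im * im;
}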

static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
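
/*
 * Caller-side sketch (illustrative only; the function name, pointer layout
 * and unaligned loads are assumptions, not VOLK API): compute
 * |y - x|^2 * scale for 8 complex symbols stored as interleaved re/im floats.
 */
static inline __m256 example_scaled_norm_dist_load(const float* symbols,
                                                   const float* points,
                                                   float scale)
{
    const __m256 y0 = _mm256_loadu_ps(symbols);
    const __m256 y1 = _mm256_loadu_ps(symbols + 8);
    const __m256 x0 = _mm256_loadu_ps(points);
    const __m256 x1 = _mm256_loadu_ps(points + 8);
    return _mm256_scaled_norm_dist_ps_avx2(y0, y1, x0, x1, _mm256_set1_ps(scale));
}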

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) computes, for each
     * element in the vectors,
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for each
     * element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
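
/*
 * Illustrative epilogue (a sketch under the assumption that the caller keeps
 * the running state in max_values/max_indices; not VOLK API): reduce the
 * eight per-lane maxima to a single scalar result after the vectorized loop.
 */
static inline unsigned example_reduce_max_index(__m256 max_values, __m256i max_indices)
{
    float vals[8];
    unsigned idx[8];
    _mm256_storeu_ps(vals, max_values);
    _mm256_storeu_si256((__m256i*)idx, max_indices);
    float best_val = vals[0];
    unsigned best = idx[0];
    for (int i = 1; i < 8; ++i) {
        if (vals[i] > best_val) { /* strict '>' keeps the first lane on ties */
            best_val = vals[i];
            best = idx[i];
        }
    }
    return best;
}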

/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
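
/*
 * Caller-side sketch (illustrative only; function name and loads are
 * assumptions, not VOLK API): find the index of the complex number with the
 * largest magnitude. 'src' points to interleaved re/im floats and num_points
 * is assumed to be a multiple of 8; a real kernel would handle the tail.
 */
static inline unsigned example_index_max(const float* src, unsigned num_points)
{
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();
    /* initial order must match the (7, 6, 3, 2, 5, 4, 1, 0) hadd_ps layout */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);
    for (unsigned i = 0; i < num_points / 8; ++i) {
        const __m256 in0 = _mm256_loadu_ps(src + 16 * i);
        const __m256 in1 = _mm256_loadu_ps(src + 16 * i + 8);
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
    }
    return example_reduce_max_index(max_values, max_indices);
}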

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) computes, for each
     * element in the vectors,
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for each
     * element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
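
/*
 * Caller-side sketch for the min search (illustrative only, not VOLK API).
 * It mirrors example_index_max above; the differences are the FLT_MAX start
 * value (FLT_MAX comes from <float.h>) and the '<' reduction.
 */
static inline unsigned example_index_min(const float* src, unsigned num_points)
{
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);
    for (unsigned i = 0; i < num_points / 8; ++i) {
        const __m256 in0 = _mm256_loadu_ps(src + 16 * i);
        const __m256 in1 = _mm256_loadu_ps(src + 16 * i + 8);
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    }
    float vals[8];
    unsigned idx[8];
    _mm256_storeu_ps(vals, min_values);
    _mm256_storeu_si256((__m256i*)idx, min_indices);
    float best_val = vals[0];
    unsigned best = idx[0];
    for (int j = 1; j < 8; ++j) {
        if (vals[j] < best_val) {
            best_val = vals[j];
            best = idx[j];
        }
    }
    return best;
}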

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead
     * of blendvps is faster on Intel CPUs (at least on the ones tested).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */