//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
//----------------------------------------------------------------------------
//
// xorInteger - bitwise xor of a pair of extended integers
//
//    word1, word2   operands, read through their m256i arrays
//    result         destination, written through its m256i array
//    activeBits     bit count to process; the index limit is
//                   activeBits / UINT256_BITS (integer division, so any
//                   remainder is not visited)
//
static void xorInteger (INTEGER *word1, INTEGER *word2, INTEGER *result, int activeBits)
   {
   const int laneLimit = activeBits / UINT256_BITS;
   const int laneStep  = ELEMENT_BITS / UINT256_BITS;
   int lane = 0;

   while (lane < laneLimit)
      {
      result->m256i [lane] = _mm256_xor_si256 (word1->m256i [lane], word2->m256i [lane]);
      lane += laneStep;
      }
   }

//----------------------------------------------------------------------------
//
// popcntToBytes256 - population count of each of the 32 bytes in ymm
//                    every byte of the result holds a value from 0 to 8
//
// Each byte is split into its two 4-bit halves. Each half selects an entry
// from a 16-entry bit-count table by way of a byte shuffle, and the two
// table results are added together.
//
static __m256i popcntToBytes256 (__m256i ymm)
   {
   // Bit count of every 4-bit value. The table appears twice because
   // _mm256_shuffle_epi8 performs its lookup separately in each 128-bit half.
   const __m256i nibbleCounts = _mm256_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   const __m256i nibbleMask = _mm256_set1_epi8 (0x0F);

   // The shift works on 16-bit elements, so bits from the neighboring byte
   // slide in; the mask removes them again.
   const __m256i lowNibbles  = _mm256_and_si256 (ymm, nibbleMask);
   const __m256i highNibbles = _mm256_and_si256 (_mm256_srli_epi16 (ymm, 4), nibbleMask);

   return _mm256_add_epi8 (_mm256_shuffle_epi8 (nibbleCounts, lowNibbles),
                           _mm256_shuffle_epi8 (nibbleCounts, highNibbles));
   }

//----------------------------------------------------------------------------
//
// horizontalAddBytes - return the integer total of all 32 bytes in the
//                      ymm argument (bytes are treated as unsigned)
//
static int horizontalAddBytes (__m256i byteSums)
   {
   // Sum of absolute differences against zero adds up each group of 8
   // bytes, leaving four partial sums, one per 64-bit element.
   __m256i sums = _mm256_sad_epu8 (byteSums, _mm256_setzero_si256 ());

   // Fold the upper 128 bits onto the lower 128 bits.
   sums = _mm256_add_epi32 (sums, _mm256_permute4x64_epi64 (sums, 0xEE));

   // Fold the second 64-bit element onto the first.
   sums = _mm256_add_epi32 (sums, _mm256_srli_si256 (sums, 8));

   // The complete total is now in the lowest 32 bits.
   return _mm_cvtsi128_si32 (_mm256_castsi256_si128 (sums));
   }

//----------------------------------------------------------------------------
// popCntIntegerPair256 - return population count of the xor of a 256-bit
// extended integer pair. Only element 0 of each operand's m256i array is
// read. The xor result is consumed directly by the count, so no temporary
// xor result has to be stored in memory.
//
static int popCntIntegerPair256 (INTEGER *word1, INTEGER *word2)
   {
   const __m256i difference = _mm256_xor_si256 (word1->m256i [0], word2->m256i [0]);

   return horizontalAddBytes (popcntToBytes256 (difference));
   }

//----------------------------------------------------------------------------
// popCntIntegerPair512 - return population count of the xor of a 512-bit
// extended integer pair. Elements 0 and 1 of each operand's m256i array are
// read. The xor results are consumed directly by the count, so no temporary
// xor result has to be stored in memory.
//
static int popCntIntegerPair512 (INTEGER *word1, INTEGER *word2)
   {
   const __m256i firstCounts  = popcntToBytes256 (_mm256_xor_si256 (word1->m256i [0], word2->m256i [0]));
   const __m256i secondCounts = popcntToBytes256 (_mm256_xor_si256 (word1->m256i [1], word2->m256i [1]));

   // Each byte holds at most 8 + 8 = 16, so the byte-wide add cannot wrap.
   return horizontalAddBytes (_mm256_add_epi8 (firstCounts, secondCounts));
   }

//----------------------------------------------------------------------------
// popCntIntegerPairActive - return population count of the xor of an
// extended integer pair, for any activeBits value. The index limit is
// activeBits / UINT256_BITS (integer division). The xor results are
// consumed directly by the count, so no temporary xor result has to be
// stored in memory.
//
static int popCntIntegerPairActive (INTEGER *word1, INTEGER *word2, int activeBits)
   {
   const int laneLimit    = activeBits / UINT256_BITS;
   const int laneStep     = ELEMENT_BITS / UINT256_BITS;
   const int addsPerFlush = 31;
   __m256i byteTotals = _mm256_setzero_si256 ();
   int pendingAdds = 0;
   int bitCount = 0;
   int lane;

   for (lane = 0; lane < laneLimit; lane += laneStep)
      {
      const __m256i difference = _mm256_xor_si256 (word1->m256i [lane], word2->m256i [lane]);

      byteTotals = _mm256_add_epi8 (byteTotals, popcntToBytes256 (difference));
      pendingAdds++;

      // Every byte of byteTotals grows by at most 8 per add, so after 31
      // adds it holds at most 248 and still fits in 8 bits. Move the byte
      // totals into the integer count before another add could wrap them.
      if (pendingAdds == addsPerFlush)
         {
         bitCount += horizontalAddBytes (byteTotals);
         byteTotals = _mm256_setzero_si256 ();
         pendingAdds = 0;
         }
      }

   // Collect whatever accumulated since the last flush.
   return bitCount + horizontalAddBytes (byteTotals);
   }

//----------------------------------------------------------------------------
//
// popCntIntegerPair - return population count of xor of extended integer pair
//
// activeBits values of exactly 256 and 512 go to the dedicated fixed-size
// routines; every other value goes to the looping routine.
//
static int popCntIntegerPair (INTEGER *word1, INTEGER *word2, int activeBits)
   {
   switch (activeBits)
      {
      case 256:
         return popCntIntegerPair256 (word1, word2);

      case 512:
         return popCntIntegerPair512 (word1, word2);

      default:
         return popCntIntegerPairActive (word1, word2, activeBits);
      }
   }

//----------------------------------------------------------------------------
