//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
//----------------------------------------------------------------------------
//
// xorInteger - xor a pair of extended integers into a third
//
//    word1, word2 - operands; only read
//    result       - receives word1 ^ word2 for every 128-bit word processed
//    activeBits   - number of bits to process
//
// Every pass handles two adjacent 128-bit words and then advances by
// ELEMENT_BITS / UINT128_BITS words.
// NOTE(review): this appears to assume ELEMENT_BITS == 2 * UINT128_BITS and
// that activeBits is a multiple of ELEMENT_BITS -- confirm against callers.
//
static void xorInteger (INTEGER *word1, INTEGER *word2, INTEGER *result, int activeBits)
   {
   const int stride = ELEMENT_BITS / UINT128_BITS;
   const int limit = activeBits / UINT128_BITS;
   int pos = 0;

   while (pos < limit)
      {
      const int next = pos + 1;

      result->m128i [pos] = _mm_xor_si128 (word1->m128i [pos], word2->m128i [pos]);
      result->m128i [next] = _mm_xor_si128 (word1->m128i [next], word2->m128i [next]);
      pos += stride;
      }
   }

//----------------------------------------------------------------------------
//
// popcntToBytes256 - per-byte population count of the register pair xmm1:xmm0
//
// Byte i of the result is popcount (byte i of xmm0) + popcount (byte i of
// xmm1), so each of the 16 result bytes holds a value ranging from 0 to 16.
//
// Every input byte is split into its two 4-bit nibbles, and each nibble is
// used as an index into a 16-entry bit-count table through _mm_shuffle_epi8.
//
static __m128i popcntToBytes256 (__m128i xmm0, __m128i xmm1)
   {
   const __m128i nibbleBitCount = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   const __m128i nibbleMask = _mm_set1_epi8 (0x0F);

   // The 16-bit shift drags bits of the neighbouring byte into the upper
   // nibble; the mask discards them again.
   const __m128i lowNibbles0 = _mm_and_si128 (xmm0, nibbleMask);
   const __m128i lowNibbles1 = _mm_and_si128 (xmm1, nibbleMask);
   const __m128i highNibbles0 = _mm_and_si128 (_mm_srli_epi16 (xmm0, 4), nibbleMask);
   const __m128i highNibbles1 = _mm_and_si128 (_mm_srli_epi16 (xmm1, 4), nibbleMask);

   // table lookups: bit count per byte (0..8) for each input register
   const __m128i bits0 = _mm_add_epi8 (_mm_shuffle_epi8 (nibbleBitCount, lowNibbles0),
                                       _mm_shuffle_epi8 (nibbleBitCount, highNibbles0));
   const __m128i bits1 = _mm_add_epi8 (_mm_shuffle_epi8 (nibbleBitCount, lowNibbles1),
                                       _mm_shuffle_epi8 (nibbleBitCount, highNibbles1));

   return _mm_add_epi8 (bits0, bits1);
   }

//----------------------------------------------------------------------------
//
// horizontalAddBytes - return the integer total of all 16 bytes in the xmm argument
//
// The bytes are treated as unsigned, so the result ranges from 0 to 16 * 255.
//
static int horizontalAddBytes (__m128i byteSums)
   {
   // _mm_sad_epu8 against zero adds bytes 0-7 into the low 64-bit lane and
   // bytes 8-15 into the high 64-bit lane.
   const __m128i laneSums = _mm_sad_epu8 (byteSums, _mm_setzero_si128 ());

   // Broadcast the high-lane sum (32-bit element 2) and add it onto the
   // low-lane sum, leaving the grand total in 32-bit element 0.
   const __m128i folded = _mm_add_epi32 (laneSums, _mm_shuffle_epi32 (laneSums, 0xAA));

   // Read back only element 0. The 64-bit read (_mm_cvtsi128_si64) is
   // avoided on purpose: it exists only on x86-64 targets, and bits 32-63
   // of the folded value still hold a copy of the high-lane sum, so the
   // result would depend on an implicit narrowing conversion to int.
   return _mm_cvtsi128_si32 (folded);
   }

//----------------------------------------------------------------------------
// popCntIntegerPair256 - population count of word1 ^ word2 over 256 bits
//
// Only the first two 128-bit words of each operand are read. The XOR result
// is passed directly to the bit counting helpers instead of being written
// to a temporary extended integer first.

static int popCntIntegerPair256 (INTEGER *word1, INTEGER *word2)
   {
   const __m128i diffLow = _mm_xor_si128 (word1->m128i [0], word2->m128i [0]);
   const __m128i diffHigh = _mm_xor_si128 (word1->m128i [1], word2->m128i [1]);

   return horizontalAddBytes (popcntToBytes256 (diffLow, diffHigh));
   }

//----------------------------------------------------------------------------
// popCntIntegerPair512 - population count of word1 ^ word2 over 512 bits
//
// Only the first four 128-bit words of each operand are read. The XOR result
// is passed directly to the bit counting helpers instead of being written
// to a temporary extended integer first.

static int popCntIntegerPair512 (INTEGER *word1, INTEGER *word2)
   {
   // per-byte counts for words 0-1 and for words 2-3; each byte is at most 16
   const __m128i lowerCounts = popcntToBytes256 (
      _mm_xor_si128 (word1->m128i [0], word2->m128i [0]),
      _mm_xor_si128 (word1->m128i [1], word2->m128i [1]));
   const __m128i upperCounts = popcntToBytes256 (
      _mm_xor_si128 (word1->m128i [2], word2->m128i [2]),
      _mm_xor_si128 (word1->m128i [3], word2->m128i [3]));

   // 16 + 16 still fits in a byte, so the two halves can be merged bytewise
   return horizontalAddBytes (_mm_add_epi8 (lowerCounts, upperCounts));
   }

//----------------------------------------------------------------------------
// popCntIntegerPairActive - population count of word1 ^ word2 over activeBits
//
// The XOR result is passed directly to the bit counting helpers instead of
// being written to a temporary extended integer first. Every pass consumes
// two adjacent 128-bit words and then advances by ELEMENT_BITS / UINT128_BITS.
// NOTE(review): this appears to assume ELEMENT_BITS == 2 * UINT128_BITS and
// that activeBits is a multiple of ELEMENT_BITS -- confirm against callers.

static int popCntIntegerPairActive (INTEGER *word1, INTEGER *word2, int activeBits)
   {
   // A pass adds at most 16 to each byte-wide sum, so 255 / 16 = 15 passes
   // can be accumulated before a byte could roll over.
   const int passesPerFlush = 15;
   const int stride = ELEMENT_BITS / UINT128_BITS;
   const int limit = activeBits / UINT128_BITS;
   int bitCount = 0;
   int pos = 0;

   while (pos < limit)
      {
      __m128i laneSums = _mm_setzero_si128 ();
      int pass;

      // accumulate one rollover-safe batch of byte-wide sums
      for (pass = 0; pass < passesPerFlush && pos < limit; pass++, pos += stride)
         {
         const __m128i diff0 = _mm_xor_si128 (word1->m128i [pos + 0], word2->m128i [pos + 0]);
         const __m128i diff1 = _mm_xor_si128 (word1->m128i [pos + 1], word2->m128i [pos + 1]);

         laneSums = _mm_add_epi8 (laneSums, popcntToBytes256 (diff0, diff1));
         }

      // flush the batch into the integer total
      bitCount += horizontalAddBytes (laneSums);
      }
   return bitCount;
   }

//----------------------------------------------------------------------------
//
// popCntIntegerPair - return population count of xor of extended integer pair
//
// Dispatches on activeBits: the 256-bit and 512-bit widths have dedicated
// fixed-size routines, every other width goes through the general loop.
//
static int popCntIntegerPair (INTEGER *word1, INTEGER *word2, int activeBits)
   {
   switch (activeBits)
      {
      case 256:
         return popCntIntegerPair256 (word1, word2);

      case 512:
         return popCntIntegerPair512 (word1, word2);

      default:
         return popCntIntegerPairActive (word1, word2, activeBits);
      }
   }

//----------------------------------------------------------------------------
