//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
//----------------------------------------------------------------------------

// memoryFence - work-group barrier covering both local and global memory
//
// Wraps the OpenCL barrier built-in with both fence flags set. Because this is
// a barrier, every work item of the work group has to reach each call.
void memoryFence (void)
    {
    barrier (CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
    }

//----------------------------------------------------------------------------
// The kernel in this file calls atomic_inc and atomic_add. Because opencl 1.0
// lacks these atomic functions, refuse to build when targeting that version.
#if (__OPENCL_VERSION__ == 100)
#error 'opencl version 1.0 cannot run this application'
#endif
//----------------------------------------------------------------------------

#if (__OPENCL_VERSION__ < 120)
// older opencl versions lack the popcount function so we must provide one 
//----------------------------------------------------------------------------
//
// popcnt256 - return population count for 256-bit value
//
// Software version for compilers without the popcount built-in. The four
// 64-bit lanes are counted in parallel: bit pairs, then nibbles, then bytes
// are summed in place, and a multiply gathers the byte sums into the top byte
// of each lane. The four lane counts are then added together.
//
uint popcnt256 (ulong4 vreg)
   {
   // a vector literal with one scalar replicates it into every lane
   const ulong4 oddBits    = (ulong4)(0x5555555555555555UL);
   const ulong4 pairMask   = (ulong4)(0x3333333333333333UL);
   const ulong4 nibbleMask = (ulong4)(0x0f0f0f0f0f0f0f0fUL);
   const ulong4 byteOnes   = (ulong4)(0x0101010101010101UL);
   ulong4 pairs, nibbles, bytes, lanes;

   // each 2-bit field becomes the count of set bits in that field
   pairs = vreg - ((vreg >> 1) & oddBits);
   // each 4-bit field becomes the count for that nibble
   nibbles = (pairs & pairMask) + ((pairs >> 2) & pairMask);
   // each byte becomes the count for that byte
   bytes = (nibbles + (nibbles >> 4)) & nibbleMask;
   // multiplying by 0x0101... accumulates all byte counts into the top byte
   lanes = (bytes * byteOnes) >> 56;

   return lanes.s0 + lanes.s1 + lanes.s2 + lanes.s3;
   }
#else // use opencl 1.2 defined 'popcount' function so that the native instruction can be used
//----------------------------------------------------------------------------
//
// popcnt256 - return population count for 256-bit value
//
// Uses the popcount built-in to count each 64-bit lane, then adds the four
// lane counts together.
//
uint popcnt256 (ulong4 vreg)
   {
   const ulong4 perLane = popcount (vreg);
   uint total;

   total  = perLane.s0;
   total += perLane.s1;
   total += perLane.s2;
   total += perLane.s3;
   return total;
   }
#endif
//----------------------------------------------------------------------------
// 
// generatecodeword - xor together selected generator rows to form a codeword
//
// generator - array of generator rows, 256 bits each
// rowSelect - bit i set means generator row i is included in the xor
// rows      - number of low-order rowSelect bits (generator rows) to examine
// returns the xor of all selected rows, or zero when none is selected
ulong4 generatecodeword (__constant ulong4 *generator, ulong rowSelect, int rows)
    {
    ulong4 codeword = (ulong4)(0);
    ulong pending = rowSelect;
    int row;

    // consume the selection mask one bit per row, lowest bit first
    for (row = 0; row < rows; ++row, pending >>= 1)
        {
        if ((pending & 1) != 0)
            {
            codeword ^= generator [row];
            }
        }
    return codeword;
    }

//----------------------------------------------------------------------------

// weightDistribution - tally codeword weights for a range of data patterns
//
// Each work item covers 2^workItemBits consecutive data patterns, starting at
// sectionStart + (linear global id) * 2^workItemBits. For every outer-loop
// pattern the generator rows selected by the pattern are xor'ed together, and
// that partial codeword is combined with every lookup1/lookup0 entry pair. The
// population count of each resulting codeword selects a counter in a
// work-group-local histogram, which is added to weightTable after the whole
// work group has finished.
//
// CODEPARMN, CODEPARMK, LOOKUP0COUNT and LOOKUP1COUNT are not defined in this
// section; presumably the host supplies them as build options (TODO confirm).
__kernel void weightDistribution (__global uint *weightTable,   // codeword weight counters (shared)
                                  __constant ulong4* generator, // array of k generator rows, 256 bits each
                                  __constant ulong4* lookup0,   // lookup table for lower bits
                                  __constant ulong4* lookup1,   // lookup table for mid bits
                                  int workItemBits,             // number of K bits to process
                                  ulong sectionStart            // work group data pattern start value
                                  )
    {
    ulong data, sectionSize, dataStart, dataStop, dataStep, globalId;
    size_t localId;
    int index0, index1;
    int codeParmN = CODEPARMN;
    int codeParmK = CODEPARMK;
    int lookup0Count = LOOKUP0COUNT;
    int lookup1Count = LOOKUP1COUNT;
    volatile __local uint subtotal [CODEPARMN + 1];

    // flatten the (up to) 3-D work item ids as (z * sizeY + y) * sizeX + x so
    // that every work item gets a unique id for 1-D, 2-D and 3-D launches
    globalId = ((ulong) get_global_id (2) * get_global_size (1) + get_global_id (1)) * get_global_size (0) + get_global_id (0);
    localId = (get_local_id (2) * get_local_size (1) + get_local_id (1)) * get_local_size (0) + get_local_id (0);

    sectionSize = (ulong) 1 << workItemBits;
    dataStart = sectionStart + globalId * sectionSize;
    dataStop = dataStart + sectionSize;

    // one outer iteration accounts for every lookup0/lookup1 combination;
    // compute the step in 64 bits so the product cannot overflow an int
    dataStep = (ulong) lookup0Count * (ulong) lookup1Count;

    // clear the subtotals (every work item writes the same zeros), then wait
    // until the whole work group sees the cleared table
    for (index0 = 0; index0 < codeParmN + 1; index0++) subtotal [index0] = 0;
    memoryFence ();

    // the outer loop handles generator rows not covered by the lookup tables
    // NOTE(review): the full pattern is passed to generatecodeword, so this
    // assumes the bits covered by the lookup tables are zero in 'data', i.e.
    // dataStart is a multiple of dataStep - confirm against the host code
    for (data = dataStart; data < dataStop; data += dataStep)
        {
        ulong4 codewordUpper;

        // xor together generator rows corresponding to bits outside the lookup tables
        codewordUpper = generatecodeword (generator, data, codeParmK);

        // inner loops: combine this result with each lookup table entry
        for (index1 = 0; index1 < lookup1Count; index1++)
            {
            ulong4 codewordMid;
            codewordMid = codewordUpper ^ lookup1 [index1];

            for (index0 = 0; index0 < lookup0Count; index0++)
                {
                uint bits;
                bits = popcnt256 (lookup0 [index0] ^ codewordMid);
                // the histogram is shared by the work group, so count atomically
                atomic_inc (&subtotal [bits]);
                }
            }
        }

    // wait for every work item of the group to finish counting, then let one
    // work item fold the group's subtotals into the global weight table
    memoryFence ();
    if (localId == 0)
        {
        for (index0 = 0; index0 < codeParmN + 1; index0++)
            {
            const uint count = subtotal [index0];

            // adding zero changes nothing, so skip the global atomic
            if (count != 0) atomic_add (&weightTable [index0], count);
            }
        }
    }

//----------------------------------------------------------------------------
