//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
//----------------------------------------------------------------------------
#include <CL/cl.h>
#include "project.h"

// OPENCL_INFO - state describing the OpenCL device selected for the computation.
// Every field except kernel is filled in by initOpenClDevice; kernel is created
// later, once the program has been built.
typedef struct
    {
    int              isAmdOcl;              // set if using AMD OpenCL platform (detected by "AMD" in the platform name)
    int              useCpu;                // set if selected OpenCL device is a cpu (CL_DEVICE_TYPE_CPU)
    size_t           maxWorkGroupSize;      // clGetDeviceInfo: CL_DEVICE_MAX_WORK_GROUP_SIZE
    cl_uint          maxComputeUnits;       // clGetDeviceInfo: CL_DEVICE_MAX_COMPUTE_UNITS
    char             deviceName [64];       // clGetDeviceInfo: CL_DEVICE_NAME (truncated to fit)
    cl_device_id     device;                // identifies the selected OpenCL device
    cl_context       context;               // clCreateContext result for the selected device
    cl_kernel        kernel;                // clCreateKernel result
    cl_program       program;               // clCreateProgramWithSource result
    cl_command_queue commandQueue;          // clCreateCommandQueue result
    }
OPENCL_INFO;

//----------------------------------------------------------------------------
// extractOpenClSourceFile - gets the OpenCL source code file that is embedded
//                           in the executable as a Windows resource

char *extractOpenClSourceFile (char **returnBuffer)
   {
   HRSRC handle1, handle2;
   char  *position;

   *returnBuffer = NULL;
   // look for our OpenCL source embedded as a resource
   handle1 = FindResource (NULL, MAKEINTRESOURCE (IDR_OPENCL_SOURCE1), "OPENCL_SOURCE");
   if (!handle1) return formatMessage ("FindResource: %s", winErrorText (0));
   handle2 = LoadResource (NULL, handle1);
   if (!handle2) return formatMessage ("LoadResource: %s", winErrorText (0));
   position = LockResource (handle2);
   if (!position) return formatMessage ("LockResource: %s", winErrorText (0));
   *returnBuffer = position;
   return NULL;
   }

//----------------------------------------------------------------------------
//
// xorInteger - xor a pair of extended integers
//
// word1, word2: the operands
// result      : receives word1 ^ word2; it may be the same object as either
//               operand because every uint64 lane is handled independently
// activeBits  : number of bits to process
//
// Lanes are processed in groups of four, advancing ELEMENT_BITS / UINT64_BITS
// lanes per pass, so the last group is always processed in full.
//
static void xorInteger (INTEGER *word1, INTEGER *word2, INTEGER *result, int activeBits)
   {
   const int laneCount = activeBits / UINT64_BITS;
   int       base, lane;

   for (base = 0; base < laneCount; base += ELEMENT_BITS / UINT64_BITS)
      for (lane = 0; lane < 4; lane++)
         result->uint64 [base + lane] = word1->uint64 [base + lane] ^ word2->uint64 [base + lane];
   }

//----------------------------------------------------------------------------
// 
// generatecodeword - xor together selected generator rows to form a codeword
//
// generator : array of generator rows
// result    : receives the xor of every selected row (all zero if none)
// rowSelect : bit n set means generator row n takes part in the xor
// activeBits: number of bits to process, passed through to xorInteger
//
static void generatecodeword (INTEGER *generator, INTEGER *result, uint64_t rowSelect, int activeBits)
    {
    const INTEGER allZero = {{0}};
    int           row;

    *result = allZero;

    // walk rowSelect from its least significant bit, stopping once no set bits remain
    for (row = 0; rowSelect != 0; row++, rowSelect >>= 1)
        {
        if ((rowSelect & 1) == 0) continue;
        xorInteger (result, &generator [row], result, activeBits);
        }
    }

//----------------------------------------------------------------------------
// openclErrorText - return ascii text corresponding to an OpenCL error code
//
// status : an OpenCL status code
// returns: the symbolic name of the code, or a formatMessage result of the
//          form "unknown error code (n)" when the code is not in the table

static char *openclErrorText (cl_int status)
    {
    // each entry pairs a status code with the spelling of its symbolic name
    #define STATUS_ENTRY(code) { code, #code }
    static const struct
        {
        cl_int code;
        char   *name;
        }
    knownStatus [] =
        {
        STATUS_ENTRY (CL_SUCCESS),
        STATUS_ENTRY (CL_DEVICE_NOT_FOUND),
        STATUS_ENTRY (CL_DEVICE_NOT_AVAILABLE),
        STATUS_ENTRY (CL_COMPILER_NOT_AVAILABLE),
        STATUS_ENTRY (CL_MEM_OBJECT_ALLOCATION_FAILURE),
        STATUS_ENTRY (CL_OUT_OF_RESOURCES),
        STATUS_ENTRY (CL_OUT_OF_HOST_MEMORY),
        STATUS_ENTRY (CL_PROFILING_INFO_NOT_AVAILABLE),
        STATUS_ENTRY (CL_MEM_COPY_OVERLAP),
        STATUS_ENTRY (CL_IMAGE_FORMAT_MISMATCH),
        STATUS_ENTRY (CL_IMAGE_FORMAT_NOT_SUPPORTED),
        STATUS_ENTRY (CL_BUILD_PROGRAM_FAILURE),
        STATUS_ENTRY (CL_MAP_FAILURE),
        STATUS_ENTRY (CL_MISALIGNED_SUB_BUFFER_OFFSET),
        STATUS_ENTRY (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST),
        STATUS_ENTRY (CL_COMPILE_PROGRAM_FAILURE),
        STATUS_ENTRY (CL_LINKER_NOT_AVAILABLE),
        STATUS_ENTRY (CL_LINK_PROGRAM_FAILURE),
        STATUS_ENTRY (CL_DEVICE_PARTITION_FAILED),
        STATUS_ENTRY (CL_KERNEL_ARG_INFO_NOT_AVAILABLE),
        STATUS_ENTRY (CL_INVALID_VALUE),
        STATUS_ENTRY (CL_INVALID_DEVICE_TYPE),
        STATUS_ENTRY (CL_INVALID_PLATFORM),
        STATUS_ENTRY (CL_INVALID_DEVICE),
        STATUS_ENTRY (CL_INVALID_CONTEXT),
        STATUS_ENTRY (CL_INVALID_QUEUE_PROPERTIES),
        STATUS_ENTRY (CL_INVALID_COMMAND_QUEUE),
        STATUS_ENTRY (CL_INVALID_HOST_PTR),
        STATUS_ENTRY (CL_INVALID_MEM_OBJECT),
        STATUS_ENTRY (CL_INVALID_IMAGE_FORMAT_DESCRIPTOR),
        STATUS_ENTRY (CL_INVALID_IMAGE_SIZE),
        STATUS_ENTRY (CL_INVALID_SAMPLER),
        STATUS_ENTRY (CL_INVALID_BINARY),
        STATUS_ENTRY (CL_INVALID_BUILD_OPTIONS),
        STATUS_ENTRY (CL_INVALID_PROGRAM),
        STATUS_ENTRY (CL_INVALID_PROGRAM_EXECUTABLE),
        STATUS_ENTRY (CL_INVALID_KERNEL_NAME),
        STATUS_ENTRY (CL_INVALID_KERNEL_DEFINITION),
        STATUS_ENTRY (CL_INVALID_KERNEL),
        STATUS_ENTRY (CL_INVALID_ARG_INDEX),
        STATUS_ENTRY (CL_INVALID_ARG_VALUE),
        STATUS_ENTRY (CL_INVALID_ARG_SIZE),
        STATUS_ENTRY (CL_INVALID_KERNEL_ARGS),
        STATUS_ENTRY (CL_INVALID_WORK_DIMENSION),
        STATUS_ENTRY (CL_INVALID_WORK_GROUP_SIZE),
        STATUS_ENTRY (CL_INVALID_WORK_ITEM_SIZE),
        STATUS_ENTRY (CL_INVALID_GLOBAL_OFFSET),
        STATUS_ENTRY (CL_INVALID_EVENT_WAIT_LIST),
        STATUS_ENTRY (CL_INVALID_EVENT),
        STATUS_ENTRY (CL_INVALID_OPERATION),
        STATUS_ENTRY (CL_INVALID_GL_OBJECT),
        STATUS_ENTRY (CL_INVALID_BUFFER_SIZE),
        STATUS_ENTRY (CL_INVALID_MIP_LEVEL),
        STATUS_ENTRY (CL_INVALID_GLOBAL_WORK_SIZE),
        STATUS_ENTRY (CL_INVALID_PROPERTY),
        STATUS_ENTRY (CL_INVALID_IMAGE_DESCRIPTOR),
        STATUS_ENTRY (CL_INVALID_COMPILER_OPTIONS),
        STATUS_ENTRY (CL_INVALID_LINKER_OPTIONS),
        STATUS_ENTRY (CL_INVALID_DEVICE_PARTITION_COUNT),
        };
    #undef STATUS_ENTRY
    size_t entry;

    // first match wins, in the same order the codes are listed above
    for (entry = 0; entry < sizeof knownStatus / sizeof knownStatus [0]; entry++)
        if (knownStatus [entry].code == status) return knownStatus [entry].name;

    return formatMessage ("unknown error code (%d)", status);
    }

//----------------------------------------------------------------------------
// deviceTypeText - return opencl device type text corresponding to cl_device_type
//
// deviceType: value reported by clGetDeviceInfo for CL_DEVICE_TYPE
// returns   : "cpu", "gpu" or "acc" for an exact match with the corresponding
//             single device type, "oth" for anything else

static char *deviceTypeText (cl_device_type deviceType)
    {
    switch (deviceType)
        {
        case CL_DEVICE_TYPE_CPU:         return "cpu";
        case CL_DEVICE_TYPE_GPU:         return "gpu";
        case CL_DEVICE_TYPE_ACCELERATOR: return "acc";
        default:                         return "oth";
        }
    }

//----------------------------------------------------------------------------
// initOpenClDevice - opencl initialization and device selection
//
// Enumerates every device of every OpenCL platform, numbering the devices
// consecutively from 1 across all platforms. When openClDevice matches one of
// those numbers, a context, a command queue and a program object (created from
// sourceCode, not yet built) are set up for that device.
//
// oclInfo     : receives the device description and the OpenCL objects
// sourceCode  : NUL terminated OpenCL program source
// openClDevice: number of the device to use, or 0 to only enumerate devices
// verbose     : when set, the platforms and devices found are printed
// returns     : NULL on success, otherwise an error message

static char *initOpenClDevice (OPENCL_INFO *oclInfo, const char *sourceCode, int openClDevice, int verbose)
    {
    int                   deviceNumber, deviceFound = 0;
    char                  *error = NULL;
    cl_int                status;
    cl_uint               platformIndex, platformCount;
    cl_platform_id        *platformIdList;
    cl_device_id          *deviceIdList = NULL;
    cl_context_properties contextProperties [3] = {0, 0, 0};

    oclInfo->isAmdOcl = 0;

    status = clGetPlatformIDs (0, NULL, &platformCount);
    if (status != CL_SUCCESS) return formatMessage ("clGetPlatformIDs: %s", openclErrorText (status));

    if (platformCount == 0) return "no OpenCL support found";
    platformIdList = malloc (platformCount * sizeof platformIdList [0]);
    if (!platformIdList) return "malloc fail";
    status = clGetPlatformIDs (platformCount, platformIdList, NULL);
    if (status != CL_SUCCESS)
        {
        error = formatMessage ("clGetPlatformIDs: %s", openclErrorText (status));
        goto enumerationDone;
        }

    deviceNumber = 1; // this number is for matching command line option 'opencl=n'
    for (platformIndex = 0; platformIndex < platformCount; platformIndex++)
        {
        cl_uint        deviceIndex, deviceCount;
        char           nameText [128];
        char           platformText [128];
        cl_platform_id platform;

        platform = platformIdList [platformIndex];
        status = clGetPlatformInfo (platform, CL_PLATFORM_NAME, sizeof platformText, platformText, NULL);
        if (status != CL_SUCCESS)
            {
            error = formatMessage ("clGetPlatformInfo: %s", openclErrorText (status));
            goto enumerationDone;
            }
        platformText [sizeof platformText - 1] = '\0';
        if (verbose) printf ("OpenCL implementation %u: %s\n", platformIndex + 1, platformText);

        status = clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
        // a platform without devices must not stop the enumeration of the others
        if (status == CL_DEVICE_NOT_FOUND) continue;
        if (status != CL_SUCCESS)
            {
            error = formatMessage ("clGetDeviceIDs: %s", openclErrorText (status));
            goto enumerationDone;
            }
        if (deviceCount == 0) continue;

        deviceIdList = malloc (deviceCount * sizeof deviceIdList [0]);
        if (!deviceIdList)
            {
            error = "malloc fail";
            goto enumerationDone;
            }
        status = clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, deviceCount, deviceIdList, NULL);
        if (status != CL_SUCCESS)
            {
            error = formatMessage ("clGetDeviceIDs: %s", openclErrorText (status));
            goto enumerationDone;
            }

        for (deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++, deviceNumber++)
            {
            cl_device_type deviceType;
            cl_device_id   device = deviceIdList [deviceIndex];

            status = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof deviceType, &deviceType, NULL);
            if (status == CL_SUCCESS) status = clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof nameText, nameText, NULL);
            if (status != CL_SUCCESS)
                {
                error = formatMessage ("clGetDeviceInfo: %s", openclErrorText (status));
                goto enumerationDone;
                }
            nameText [sizeof nameText - 1] = '\0';

            // if device number matches the one given on command line, use it
            if (openClDevice == deviceNumber)
                {
                deviceFound = 1;
                oclInfo->device = device;
                strncpy (oclInfo->deviceName, nameText, sizeof oclInfo->deviceName - 1);
                oclInfo->deviceName [sizeof oclInfo->deviceName - 1] = '\0';
                contextProperties [0] = CL_CONTEXT_PLATFORM;
                contextProperties [1] = (cl_context_properties) platform;
                contextProperties [2] = 0;
                oclInfo->useCpu = deviceType == CL_DEVICE_TYPE_CPU;
                // record the platform of the selected device, not of whichever platform is listed last
                oclInfo->isAmdOcl = strstr (platformText, "AMD") != NULL;
                if (verbose) printf ("--->");
                }
            else if (verbose) printf ("    ");
            if (verbose) printf ("Device %u %s: %s\n", deviceIndex + 1, deviceTypeText (deviceType), skipWhiteSpace (nameText));
            }
        free (deviceIdList);
        deviceIdList = NULL;
        }

enumerationDone:
    free (deviceIdList);
    free (platformIdList);
    if (error) return error;

    // if device number not given on command line, just show available opencl devices
    if (openClDevice == 0) return NULL;

    // without this check an out of range device number would reach clCreateContext with no device selected
    if (!deviceFound) return formatMessage ("OpenCL device %d not found", openClDevice);

    oclInfo->context = clCreateContext (contextProperties, 1, &oclInfo->device, NULL, NULL, &status);
    if (status != CL_SUCCESS) return formatMessage ("clCreateContext: %s", openclErrorText (status));

    oclInfo->commandQueue = clCreateCommandQueue (oclInfo->context, oclInfo->device, 0, &status);
    if (status != CL_SUCCESS)
        {
        error = formatMessage ("clCreateCommandQueue: %s", openclErrorText (status));
        goto releaseContext;
        }

    // AMD opencl relies on the global path variable to find the JIT compiler.
    // Work around this problem so that there is no dependence on the system path variable.
    if (oclInfo->isAmdOcl)
        {
        static const char jitSubdirectory [] = "bin\\x86_64";
        char              tempPath [_MAX_PATH];
        // leave room for the subdirectory text that is appended below
        unsigned long     limit = (unsigned long) (sizeof tempPath - (sizeof jitSubdirectory - 1));
        unsigned long     length;

        // zero means the variable is missing, a value >= limit means it did not fit;
        // in both cases tempPath holds no usable text, so the path is left alone
        length = GetEnvironmentVariable ("AMDAPPSDKROOT", tempPath, limit);
        if (length != 0 && length < limit)
            {
            strcat (tempPath, jitSubdirectory);
            SetEnvironmentVariable ("path", tempPath);
            }
        }

    status = clGetDeviceInfo (oclInfo->device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof oclInfo->maxWorkGroupSize, &oclInfo->maxWorkGroupSize, 0);
    if (status == CL_SUCCESS) status = clGetDeviceInfo (oclInfo->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof oclInfo->maxComputeUnits, &oclInfo->maxComputeUnits, 0);
    if (status != CL_SUCCESS)
        {
        error = formatMessage ("clGetDeviceInfo: %s", openclErrorText (status));
        goto releaseQueue;
        }

    oclInfo->program = clCreateProgramWithSource (oclInfo->context, 1, &sourceCode, NULL, &status);
    if (status != CL_SUCCESS)
        {
        error = formatMessage ("clCreateProgramWithSource: %s", openclErrorText (status));
        goto releaseQueue;
        }

    return NULL;

    // failure after the context exists: give back what was created here
releaseQueue:
    clReleaseCommandQueue (oclInfo->commandQueue);
    oclInfo->commandQueue = NULL;
releaseContext:
    clReleaseContext (oclInfo->context);
    oclInfo->context = NULL;
    return error;
    }

//----------------------------------------------------------------------------
// timeKernel - measure execution time of an OpenCL clEnqueueNDRangeKernel invocation
//
// oclInfo     : supplies the kernel and the command queue
// workSizeBits: the kernel is enqueued with a global work size of 2^workSizeBits
// workItemBits: value stored in kernel argument 4 before the launch
// milliseconds: receives the time from enqueue until clFinish returns
// returns     : NULL on success, otherwise an error message

static char *timeKernel (OPENCL_INFO *oclInfo, int workSizeBits, int workItemBits, uint64_t *milliseconds)
    {
    size_t   workSize = 1ull << workSizeBits;
    uint64_t begin, ticks;
    cl_int   result;

    // the argument is set before the clock starts, so only the launch itself is timed
    result = clSetKernelArg (oclInfo->kernel, 4, sizeof workItemBits, &workItemBits);
    if (result != CL_SUCCESS) return formatMessage ("clSetKernelArg: %s", openclErrorText (result));

    begin = queryPerformanceCounter ();

    result = clEnqueueNDRangeKernel (oclInfo->commandQueue, oclInfo->kernel, 1, NULL, &workSize, NULL, 0, NULL, NULL);
    if (result != CL_SUCCESS) return formatMessage ("clEnqueueNDRangeKernel: %s", openclErrorText (result));

    result = clFinish (oclInfo->commandQueue);
    if (result != CL_SUCCESS) return formatMessage ("clFinish: %s", openclErrorText (result));

    ticks = queryPerformanceCounter () - begin;
    *milliseconds = 1000 * ticks / queryPerformanceFrequency ();
    return NULL;
    }

//----------------------------------------------------------------------------
// buildProgram - run the opencl JIT compile
//
// Builds oclInfo->program for oclInfo->device, passing the code parameters
// and lookup table sizes to the OpenCL compiler as -D definitions. The build
// log is printed when it is not empty and either the build failed or
// codeInfo->verbose is set.
//
// oclInfo : supplies the program, the device and the useCpu/isAmdOcl flags
// codeInfo: supplies codeParmN, codeParmK, lookup0Bits, lookup1Bits and verbose
// returns : NULL on success, otherwise an error message

static char *buildProgram (OPENCL_INFO *oclInfo, CODEINFO *codeInfo)
    {
    cl_int  status, buildStatus;
    size_t  buildLogSize = 0;
    char    *buildLog;
    char    options [400], *position = options;

    position += sprintf (position, "-DCODEPARMN=%d ", codeInfo->codeParmN);
    position += sprintf (position, "-DCODEPARMK=%d ", codeInfo->codeParmK);
    position += sprintf (position, "-DLOOKUP0COUNT=%d ", 1 << codeInfo->lookup0Bits);
    position += sprintf (position, "-DLOOKUP1COUNT=%d ", 1 << codeInfo->lookup1Bits);
    position += sprintf (position, "-DCPU_BUILD=%d ", oclInfo->useCpu);
    position += sprintf (position, "-DAMD_BUILD=%d ", oclInfo->isAmdOcl);
    // extra options that are passed only when the AMD platform is in use
    if (oclInfo->isAmdOcl) strcat (options, "-save-temps ");
    if (oclInfo->isAmdOcl) strcat (options, "-Dcl_amd_media_ops ");
    if (codeInfo->verbose) printf ("OpenCL just-in-time compile...");
    buildStatus = clBuildProgram (oclInfo->program, 1, &oclInfo->device, options, NULL, NULL);
    if (codeInfo->verbose) printf ("\n");

    // the build result is examined only after the log has been retrieved, so
    // that compiler messages can be shown for a failed build

    // first call: ask only for the size of the build log
    status = clGetProgramBuildInfo (oclInfo->program, oclInfo->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
    if (status != CL_SUCCESS) return formatMessage ("clGetProgramBuildInfo: %s", openclErrorText (status));

    // zero filled and slightly oversized, so the log is always NUL terminated
    buildLog = calloc (buildLogSize + 2, 1);
    if (!buildLog) return "malloc fail";

    // second call: fetch the log text
    status = clGetProgramBuildInfo (oclInfo->program, oclInfo->device, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
    if (status != CL_SUCCESS)
        {
        free (buildLog);
        return formatMessage ("clGetProgramBuildInfo: %s", openclErrorText (status));
        }

    if (*buildLog)
        if (codeInfo->verbose || buildStatus != CL_SUCCESS)
            {
            printf ("------------------------\n");
            printf ("%s\n", buildLog);
            printf ("------------------------\n");
            }
    free (buildLog);
    if (buildStatus == CL_SUCCESS) return NULL;
    return formatMessage ("%s\n", openclErrorText (buildStatus));
    }

//----------------------------------------------------------------------------
// main entry point for OpenCL implementation of binary linear code weight distribution function

char *mainprog256_gpu (CODEINFO *codeInfo, int openClDevice)
    {
    cl_mem      generatorBuffer, outputBuffer, lookup0Buffer, lookup1Buffer;
    char        *source, *error;
    INTEGER     *generator, *lookup0, *lookup1;
    cl_uint     *countTable, *countTablePrevious;
    uint64_t    *weightTable;
    int         countTableSize, weightTableSize;
    int         index, minimumDistance, threads;
    int         loop, nonLookupBits, threadMax, threadBitMax, loopBitsMax;
    int         bestWorkSizeBits, bestLoopBits, smallestLoopBits;
    int         codeParmN, codeParmK, eccWeight;
    int         lookup0Size, lookup1Size, lookup0Count, lookup1Count, lookup0Bits, lookup1Bits;
    int         loopBits;       // number of K bits processed by each outer loop pass
    int         workSizeBits;
    int         threadBits;     // number of K bits processed by global_work_size
    size_t      global_work_size [1];
    uint64_t    totalCodewords, processedCodewords, expectedCodewords;
    uint64_t    start, elapsed;
    cl_int      workItemBits;   // number of K bits processed by a single work item
    cl_int      benchmarkWorkItemBits;
    cl_int	    status;
    cl_ulong    sectionStart;
    OPENCL_INFO *oclInfo;
    uint64_t    bestTime, bestMilliseconds = 0;

    oclInfo = calloc (1, sizeof oclInfo [0]);
    if (!oclInfo) return "malloc fail";
    error = extractOpenClSourceFile (&source);
    if (error) return error;

    error = initOpenClDevice (oclInfo, source, openClDevice, codeInfo->verbose);
    if (error) return error;

    // if device number not given on command line, just show available opencl devices
    if (openClDevice == 0) return NULL;

    codeParmN = codeInfo->codeParmN;
    codeParmK = codeInfo->codeParmK;
    generator = codeInfo->generator;

    // 32 max to ensure 32-bit subtotal counters cannot wrap
    loopBitsMax = 32;
    loopBits = codeParmK;
    if (loopBits > loopBitsMax) loopBits = loopBitsMax;

    // size for the two stage lookup tables
    lookup0Bits = 8;
    lookup1Bits = 4;

    // nonLookupBits determines work item loops. To limit the relative overhead
    // work item launch, make sure the work item does some minimum amount of processing.
    nonLookupBits = codeParmK - (lookup0Bits + lookup1Bits);
    if (nonLookupBits < 4)
        {
        lookup0Bits = codeParmK / 2 + 1;
        lookup1Bits = codeParmK / 2 - 1;
        nonLookupBits = codeParmK - (lookup0Bits + lookup1Bits);
        }

    // save final lookup sizes
    codeInfo->lookup0Bits = lookup0Bits;
    codeInfo->lookup1Bits = lookup1Bits;

    // If using AMD generated cpu code, problems result when work items
    // is more then 4X the core count. JUst limit to core count, because
    // more threads than cores doesn't help performance
    threadMax = 0x100; // initial limit
    if (oclInfo->useCpu) threadMax = oclInfo->maxComputeUnits;

    threadBitMax = highestSetBit64 (threadMax);
    threadBits = nonLookupBits;
    if (threadBits > threadBitMax) threadBits = threadBitMax;
    threads = 1 << threadBits;
    global_work_size [0] = threads;

    countTableSize = (codeParmN + 1) * sizeof countTable [0];
    weightTableSize = (codeParmN + 1) * sizeof weightTable [0];
    countTable = calloca (countTableSize, 64);
    countTablePrevious = calloca (countTableSize, 64);
    weightTable = calloca (weightTableSize, 64);

    // build lookup tables
    lookup0Count = 1 << lookup0Bits;
    lookup1Count = 1 << lookup1Bits;
    lookup0Size = lookup0Count * sizeof lookup0 [0];
    lookup1Size = lookup1Count * sizeof lookup1 [0];
    lookup0 = calloca (lookup0Size, 64);
    lookup1 = calloca (lookup1Size, 64);
    if (!countTable || !countTablePrevious || !lookup0 || !lookup1) return "malloc fail";

    // Precompute a lookup table of the xor of a portion of the generator rows.
    // This reduces the average xor argument count to close to two, rather than K/2.
    for (index = 0; index < lookup0Count; index++)
        generatecodeword (generator, &lookup0 [index], index << 0, MAXBITS);

    for (index = 0; index < lookup1Count; index++)
        generatecodeword (generator, &lookup1 [index], index << lookup0Bits, MAXBITS);

    outputBuffer    = clCreateBuffer (oclInfo->context, CL_MEM_WRITE_ONLY, countTableSize, NULL, NULL);
    generatorBuffer = clCreateBuffer (oclInfo->context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof generator [0] * codeParmK, generator, NULL);
    lookup0Buffer   = clCreateBuffer (oclInfo->context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR , lookup0Size, lookup0, NULL);
    lookup1Buffer   = clCreateBuffer (oclInfo->context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR , lookup1Size, lookup1, NULL);
    if (!outputBuffer || !generatorBuffer || !lookup0Buffer || !lookup1Buffer) return formatMessage ("clCreateBuffer fail");

    error = buildProgram (oclInfo, codeInfo);
    if (error) return error;

    // workItemBits value used when benchmark is skipped
    workItemBits = loopBits - threadBits;

    sectionStart = 0;

    oclInfo->kernel = clCreateKernel (oclInfo->program, "weightDistribution", &status);
    if (status != CL_SUCCESS) return formatMessage ("clCreateKernel: %s", openclErrorText (status));

    status = CL_SUCCESS;
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 0, sizeof outputBuffer, &outputBuffer);
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 1, sizeof generatorBuffer, &generatorBuffer);
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 2, sizeof lookup0Buffer, &lookup0Buffer);
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 3, sizeof lookup1Buffer, &lookup1Buffer);
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 4, sizeof workItemBits, &workItemBits);
    if (status == CL_SUCCESS) status = clSetKernelArg (oclInfo->kernel, 5, sizeof sectionStart, &sectionStart);
    if (status != CL_SUCCESS) return formatMessage ("clSetKernelArg: %s", openclErrorText (status));

    //---------------------------------------------------------------------------

    benchmarkWorkItemBits = lookup0Bits + lookup1Bits + 4;

    // benchmark to find workSizeBits that maximizes kernels / second (workSize is like threads)
    if (!oclInfo->useCpu && benchmarkWorkItemBits <= workItemBits)
        {
        int workSizeBitLimit;
        if (codeInfo->verbose) printf ("determining simultaneous computation count (global_work_size)...");

        workItemBits = benchmarkWorkItemBits;
        bestTime = INT_MAX;
        bestWorkSizeBits = 4;
        workSizeBitLimit = 20;

        // work-around to avoid freeze when running on Llano GPU
        if (strstr (oclInfo->deviceName, "BeaverCreek")) workSizeBitLimit = 6;

        for (workSizeBits = 4; workSizeBits <= workSizeBitLimit; workSizeBits++)
            {
            uint64_t milliseconds;
            uint64_t nsPerKernel;

            error = timeKernel (oclInfo, workSizeBits, workItemBits, &milliseconds);
            if (error) return error;
            nsPerKernel = milliseconds * 1000000ull / (1ull << workSizeBits);
            if (bestTime <= (nsPerKernel * 100) / 100) break;
            if (milliseconds > 1000) break;
            bestTime = nsPerKernel;
            bestWorkSizeBits = workSizeBits;
            }
        workSizeBits = bestWorkSizeBits;
        global_work_size [0] = 1ull << workSizeBits;
        if (codeInfo->verbose) printf ("2^%d=%s\n", workSizeBits, powerOf2Text (workSizeBits));
    
        // benchmark to maximize workItemBits while keeping each opencl invocation time reasonable 
        if (codeInfo->verbose) printf ("determining codeword count for each computation...");
        bestTime = INT_MAX;
        smallestLoopBits = workSizeBits + lookup0Bits + lookup1Bits + 4;
        if (smallestLoopBits > codeParmK) smallestLoopBits = codeParmK;
        bestLoopBits = smallestLoopBits;
        for (loopBits = smallestLoopBits; loopBits < 50; loopBits++)
            {
            uint64_t milliseconds;
            uint64_t nsPerKernel;

            if (loopBits == codeParmK) break;
        
            workItemBits = loopBits - workSizeBits;
            error = timeKernel (oclInfo, workSizeBits, workItemBits, &milliseconds);
            if (error) return error;
            nsPerKernel = milliseconds * 1000000ull / (1ull << loopBits);
            if (bestTime < (nsPerKernel * 105) / 100) break;
            if (milliseconds > 1000) break;
            bestTime = nsPerKernel;
            bestLoopBits = loopBits;
            bestMilliseconds = milliseconds;
            }
        loopBits = bestLoopBits;
        workItemBits = loopBits - workSizeBits;
        if (lookup0Bits + lookup1Bits > workItemBits) return "lookup too big\n";
        if (codeInfo->verbose) printf ("2^%d=%s\n", workItemBits, powerOf2Text (workItemBits));
        status = clSetKernelArg (oclInfo->kernel, 4, sizeof workItemBits, &workItemBits);
        if (status != CL_SUCCESS) return formatMessage ("clSetKernelArg: %s", openclErrorText (status));
        }
    else if (codeInfo->verbose)
        {
        int globalWorkSizeBits = highestSetBit64 (global_work_size [0]);
        printf ("global_work_size: 2^%d=%s\n", globalWorkSizeBits, powerOf2Text (globalWorkSizeBits));
        printf ("work item size  : 2^%d=%s\n", workItemBits, powerOf2Text (workItemBits));
        }
    printf ("[%d, %d, ?]\n", codeInfo->codeParmN, codeInfo->codeParmK);
    if (bestMilliseconds)
        {
        uint64_t estimate = bestMilliseconds;
        estimate *= 1ull << (codeParmK - loopBits);
        estimate /= 1000;
        if (estimate > 10) printf ("estimated running time: %s\n", timeEstimateText (estimate));
        }



    if (codeInfo->verbose) printf ("running...\n");
    start = queryPerformanceCounter ();

    // initialize shapshot of countTable
    status = clEnqueueReadBuffer (oclInfo->commandQueue, outputBuffer, CL_TRUE, 0, countTableSize, countTablePrevious, 0, NULL, NULL);
    if (status != CL_SUCCESS) return formatMessage ("clEnqueueReadBuffer: %s", openclErrorText (status));

    minimumDistance = INT_MAX;
    expectedCodewords = 0;
    loop = 0;
    totalCodewords = 1ull << codeParmK;
    for (sectionStart = 0; sectionStart < totalCodewords; sectionStart += 1ull << loopBits, loop++)
        {
        status = clSetKernelArg (oclInfo->kernel, 5, sizeof sectionStart, &sectionStart);
        if (status != CL_SUCCESS) return formatMessage ("clSetKernelArg: %s", openclErrorText (status));
        status = clEnqueueNDRangeKernel (oclInfo->commandQueue, oclInfo->kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        if (status != CL_SUCCESS) return formatMessage ("clEnqueueNDRangeKernel: %s", openclErrorText (status));

        status = clEnqueueReadBuffer (oclInfo->commandQueue, outputBuffer, CL_TRUE, 0, countTableSize, countTable, 0, NULL, NULL);
        if (status != CL_SUCCESS) return formatMessage ("clEnqueueReadBuffer: %s", openclErrorText (status));

        processedCodewords = 0;
        expectedCodewords = 1ull << loopBits;
        for (eccWeight = 0; eccWeight <= codeParmN; eccWeight++)
            {
            uint32_t delta = countTable [eccWeight] - countTablePrevious [eccWeight];
            if (delta == 0) continue;
            countTablePrevious [eccWeight] = countTable [eccWeight];
            weightTable [eccWeight] += delta;
            processedCodewords += delta;
            if (eccWeight && !minimumDistance) minimumDistance = eccWeight;
            if (eccWeight == 0) continue;
            if (minimumDistance <= eccWeight) continue;
            minimumDistance = eccWeight;
            printf ("%d ", minimumDistance);
            }
        
        if (processedCodewords != expectedCodewords)
            {
            printf ("\nloop %X: processed only %llX of %llX codewords\n", loop, processedCodewords, expectedCodewords);
            //return 1;
            }
        printf (".");
        }

    elapsed = queryPerformanceCounter () - start;
    processedCodewords = 0;
    minimumDistance = 0;
    printf ("\n");
    for (eccWeight = 0; eccWeight <= codeParmN; eccWeight++)
        {
        uint64_t entry = weightTable [eccWeight];
        if (entry == 0) continue;
        if (eccWeight && !minimumDistance) minimumDistance = eccWeight;
        processedCodewords += entry;
        printf ("%-7d %llu\n", eccWeight, entry);
        }

    if (processedCodewords != totalCodewords)
        printf ("----- expected %llX code words, but processed %llX -----\n", totalCodewords, processedCodewords);

    printf ("[%d, %d, %d]\n", codeParmN, codeParmK, minimumDistance);
    printf ("elapsed time: %s\n", elapsedTimeText ((double) elapsed / queryPerformanceFrequency ()));

    //---------------------------------------------------------------------------

    status = CL_SUCCESS;
    if (status == CL_SUCCESS) status = clReleaseKernel (oclInfo->kernel);
    if (status == CL_SUCCESS) status = clReleaseProgram (oclInfo->program);
    if (status == CL_SUCCESS) status = clReleaseMemObject (generatorBuffer);
    if (status == CL_SUCCESS) status = clReleaseMemObject (lookup0Buffer);
    if (status == CL_SUCCESS) status = clReleaseMemObject (lookup1Buffer);
    if (status == CL_SUCCESS) status = clReleaseMemObject (outputBuffer);
    if (status == CL_SUCCESS) status = clReleaseCommandQueue (oclInfo->commandQueue);
    if (status == CL_SUCCESS) status = clReleaseContext (oclInfo->context);
    if (status != CL_SUCCESS) return formatMessage ("opencl cleanup: %s", openclErrorText (status));

    freea (countTable);
    freea (countTablePrevious);
    freea (weightTable);
    freea (lookup0);
    freea (lookup1);

    return 0;
    }

//----------------------------------------------------------------------------
