//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
// Functions:
//   tabulate weight distribution
//      The result is obtained by generating all 2^K codewords of an [N, K, D] code.
//      Generator maximums: N - 65536  (set by defined symbol MAXBITS)
//                          K - 64     (bit size of C99 uint64_t)
//
//----------------------------------------------------------------------------
#include "project.h"
//----------------------------------------------------------------------------
// This file is compiled multiple times so that each algorithm can be compiled
// using different code generation options and still benefit from whole program
// optimization without the need for gcc link time optimization. Using gcc link
// time optimization (-flto) with different code generation options causes
// unsupported processor instructions to be included in functions that need to
// execute on any processor.

#if defined (GPR_BUILD)
   #define mainProg MAIN_PROG_NAME (gpr)
   #include "extendedInteger/popcnt_gpr.c"
#elif defined (XMMPOP_BUILD)
   #define mainProg MAIN_PROG_NAME (xmmpop)
   #include "extendedInteger/popcnt_xmmpop.c"
#elif defined (YMMPOP_BUILD)
   #define mainProg MAIN_PROG_NAME (ymmpop)
   #include "extendedInteger/popcnt_ymmpop.c"
#elif defined (SSE_BUILD)
   #define mainProg MAIN_PROG_NAME (sse)
   #include "extendedInteger/popcnt_sse.c"
#elif defined (AVX_BUILD)
   #define mainProg MAIN_PROG_NAME (avx)
   #include "extendedInteger/popcnt_sse.c"
#elif defined (AVX2_BUILD)
   #define mainProg MAIN_PROG_NAME (avx2)
   #include "extendedInteger/popcnt_avx2.c"
#endif

//----------------------------------------------------------------------------
//
// isBaseN - Returns non-zero if the ascii character is a valid digit for the
//           given base (2, 8, or 16). Hex digits A-F are accepted in either
//           upper or lower case.
//
static int isBaseN (int asciiDigit, int base)
   {
   // decimal digit characters are valid when their value is below the base
   if (asciiDigit >= '0' && asciiDigit <= '9') return (asciiDigit - '0') < base;

   // Only the letters A-F extend the digit set, and only for hex. Testing the
   // letter ranges explicitly keeps the characters that sit between '9' and
   // 'A' in the ascii table (: ; < = > ?) from being mistaken for the hex
   // values 10-15.
   if (base == 16)
      {
      if (asciiDigit >= 'A' && asciiDigit <= 'F') return 1;
      if (asciiDigit >= 'a' && asciiDigit <= 'f') return 1;
      }
   return 0;
   }

//----------------------------------------------------------------------------
//
// isDigitSeparator - Returns 1 if the character may appear between the digits
//                    of a generator row (space, tab, or comma), else 0.
//                    When raw is non-zero no separator is accepted at all,
//                    so the result is always 0.
//
static int isDigitSeparator (int ch, int raw)
   {
   if (!raw)
      {
      switch (ch)
         {
         case ' ':
         case '\t':
         case ',':
            return 1;
         default:
            break;
         }
      }
   return 0;
   }

//----------------------------------------------------------------------------
//
// isEolChar - Returns 1 if the character ends a line: new line, carriage
//             return (Windows), or <Ctrl>Z end of file marker (Windows).
//             Returns 0 for every other character.
//
static int isEolChar (int ch)
   {
   return ch == '\n' || ch == '\r' || ch == '\x1A';
   }

//----------------------------------------------------------------------------
//
// scanDigits - Read an extended integer from an ascii buffer of digits.
//              The number starts at buffer and extends over every following
//              character that is a digit of the given base or (unless raw
//              mode is set) a digit separator. The last digit of the number
//              is the least significant one.
//              Accepts base 2, 8, 16.
//
//              buffer   - start of the ascii number
//              codeword - receives the value; cleared first, so an empty
//                         number yields zero
//              base     - 2, 8 or 16
//              raw      - non-zero if no digit separator is accepted
//
//              Returns NULL on success, or an error message if a set bit
//              falls at or above bit position MAXBITS.
//
static char *scanDigits (char *buffer, INTEGER *codeword, int base, int raw)
   {
   char *startOfNumber, *position;
   int bitno, index, bitsPerDigit;
   const INTEGER IntegerZero = {{0}};

   // save start of generator row
   startOfNumber = position = buffer;

   // find end of this generator row
   while (isBaseN (*position, base) || isDigitSeparator (*position, raw)) position++;
   position--;

   *codeword = IntegerZero;
   bitsPerDigit = highestSetBit64 (base);
   bitno = 0;

   // walk backwards from the least significant digit to the start of the number
   while (position >= startOfNumber)
      {
      int value;

      // Skip optional separators one at a time so that the loop condition is
      // re-checked after each step. This keeps leading separators (as in
      // "[ 1 0 1]") from walking the scan to a position before the start of
      // the number.
      if (isDigitSeparator (*position, raw))
         {
         position--;
         continue;
         }

      value = toupper (*position);
      if (value >= 'A') value = 10 + (value - 'A');
      else value -= '0';

      // transfer the digit to the codeword, least significant bit first
      for (index = 0; index < bitsPerDigit; index++)
         {
         if (value & 1)
            {
            if (bitno >= MAXBITS)
               {
               // bitno is the zero based position of the bit that does not
               // fit, so at least bitno + 1 bits are needed
               int requiredBits = roundUp (bitno + 1, ELEMENT_BITS);
               return formatMessage ("(increase MAXBITS to %d and recompile)", requiredBits);
               }
            setBit (codeword, bitno);
            }
         bitno++;
         value >>= 1;
         }
      position--;
      }
   return NULL;
   }

//----------------------------------------------------------------------------
//
// findBase - Looks at the leading run of alphanumeric characters in an ascii
//            buffer and returns the smallest supported base (2, 8, or 16)
//            that could be intended. An empty run returns 2. If the run
//            holds a letter beyond F (in either case), an error is printed
//            and the program exits with status 1.
//
static int findBase (char *position)
   {
   int base, maxCharacter = '0';

   // The casts to unsigned char are needed because passing a negative char
   // value to the <ctype.h> functions is undefined.
   for (; isalnum ((unsigned char) *position); position++)
      {
      // Fold case before comparing. Lower case letters have larger codes than
      // upper case ones, so without folding a valid 'f' would hide an invalid
      // 'G' that appears in the same number.
      int ch = toupper ((unsigned char) *position);
      if (maxCharacter < ch) maxCharacter = ch;
      }

   if (maxCharacter < '2') base = 2;
   else if (maxCharacter < '8') base = 8;
   else if (maxCharacter < 'G') base = 16;
   else
      {
      printf ("character '%c' does not belong to a supported number system\n", maxCharacter);
      exit (1);
      }
   return base;
   }

//----------------------------------------------------------------------------
//
// findMagmaGapGenerator - Check the passed buffer for a Magma or GAP formatted
//                         generator. The markers are tried in this order:
//                         "Generator matrix:" (Magma dump), '|' (Magma
//                         input), "GeneratorMatCode([" (GAP dump). Returns
//                         the address just past the first marker found, or
//                         NULL if the buffer contains none of them.
//
static char *findMagmaGapGenerator (char *buffer)
   {
   static const char magmaDumpMarker [] = "Generator matrix:";
   static const char gapDumpMarker   [] = "GeneratorMatCode([";
   char *match;

   // Magma generator dump: rows follow the heading
   match = strstr (buffer, magmaDumpMarker);
   if (match != NULL) return match + (sizeof magmaDumpMarker - 1);

   // Magma generator input: rows follow the vertical bar
   match = strchr (buffer, '|');
   if (match != NULL) return match + 1;

   // GAP generator dump: rows follow the opening of the outer list
   match = strstr (buffer, gapDumpMarker);
   if (match != NULL) return match + (sizeof gapDumpMarker - 1);

   return NULL;
   }

//----------------------------------------------------------------------------
//
// readMagmaGapGenerator - Read generator from Magma or GAP formatted text.
//
//    codeInfo - receives the generator rows (codeInfo->generator), the row
//               count (codeParmK) and the codeword length (codeParmN, which
//               is only ever raised here, never reset)
//    buffer   - text positioned at the first generator row; it is modified
//               in place (end of line characters become spaces and each
//               row's closing bracket becomes a terminator)
//
//    Each row has the form [d d d ...] with base 2 digits and optional
//    separators. Rows may be followed by a comma. Reading stops at the first
//    item that does not start with '[' or that contains a '<' character.
//
//    Returns NULL on success or an error message.
//
static char *readMagmaGapGenerator (CODEINFO *codeInfo, char *buffer)
   {
   char *error, *position;

   // end of line characters are not used, replace them with spaces
   for (position = buffer; *position; position++)
      if (isEolChar (*position)) *position = ' ';

   // initial generator allocation, will grow as rows are read
   codeInfo->codeParmK = 0;
   codeInfo->generator = calloca (sizeof codeInfo->generator [0] * (codeInfo->codeParmK + 1), INTEGER_ALIGNMENT);
   if (!codeInfo->generator) return "memory allocation failed";

   // read each generator row
   position = buffer;
   for (;;)
      {
      int iniWordBits;
      char *startOfRow, *endOfRow;
      INTEGER codeword;

      // stop if no more generator rows
      position = skipWhiteSpace (position);
      if (*position != '[') break;

      // skip opening bracket, find closing bracket
      startOfRow = position + 1;
      endOfRow = strchr (startOfRow, ']');
      if (!endOfRow) return formatMessage ("generator row %d missing ']'", codeInfo->codeParmK);

      // mark the end of this row
      *endOfRow = '\0';

      // If the row has any < or > characters, then it is a weight distribution
      // and not a generator row.
      if (strchr (startOfRow, '<')) break;

      error = scanDigits (startOfRow, &codeword, 2, 0);
      if (error) return error;

      // add the codeword to the generator, then grow the allocation so that
      // there is always room for one more row
      codeInfo->generator [codeInfo->codeParmK++] = codeword;
      codeInfo->generator = realloca (codeInfo->generator, sizeof codeInfo->generator [0] * (codeInfo->codeParmK + 1), INTEGER_ALIGNMENT);
      if (!codeInfo->generator) return "memory allocation failed";

      // track the highest bit number to find N
      iniWordBits = highestSetBit (&codeword, MAXBITS) + 1;
      if (codeInfo->codeParmN < iniWordBits) codeInfo->codeParmN = iniWordBits;

      // skip ahead to start of next generator row
      position = skipWhiteSpace (endOfRow + 1);
      if (*position == ',') position++;
      }
   return NULL;
   }

//----------------------------------------------------------------------------
//
// readRawGenerator - Read binary linear code generator from text, one
//                    generator row per line, base 2, 8, or 16.
//
//    codeInfo - receives the generator rows (codeInfo->generator), the row
//               count (codeParmK) and the codeword length (codeParmN, which
//               is only ever raised here, never reset)
//    filebuf  - nul terminated file contents
//
//    Lines whose first non-blank character is ';' or '#' are comments. The
//    first pass finds the largest base needed by any row, and that base is
//    then used to read every row. Both passes stop at the end of the text or
//    at the first line that begins with an end of line character.
//
//    Returns NULL on success or an error message (rows out of range,
//    duplicate rows, allocation failure).
//
static char *readRawGenerator (CODEINFO *codeInfo, char *filebuf)
   {
   char *error, *position, *buffer, *next;
   int base, maxBase;

   // figure out if binary, octal, or hex
   maxBase = 0;
   next = filebuf;
   while (next && !isEolChar (*next))
      {
      // remember this line and advance to the start of the following one
      buffer = next;
      next = strchr (next, '\n');
      if (next) next++;
      position = skipWhiteSpace (buffer);

      // skip comments
      if (*position == ';'  || *position == '#') continue;
      if (*position == '\0') break;
      base = findBase (position);
      if (maxBase < base) maxBase = base;
      }
   base = maxBase;

   // initial generator allocation, will grow as rows are read
   codeInfo->codeParmK = 0;
   codeInfo->generator = calloca (sizeof codeInfo->generator [0] * (codeInfo->codeParmK + 1), INTEGER_ALIGNMENT);
   if (!codeInfo->generator) return "memory allocation failed";

   next = filebuf;
   while (next && !isEolChar (*next))
      {
      INTEGER codeword;
      int index, iniWordBits;

      buffer = next;
      next = strchr (next, '\n');
      if (next) next++;
      position = skipWhiteSpace (buffer);

      // skip comments
      if (*position == ';'  || *position == '#') continue;
      if (*position == '\0') break;

      error = scanDigits (position, &codeword, base, 1);
      if (error) return error;

      // check for duplicate entries
      for (index = 0; index < codeInfo->codeParmK; index++)
         if (memcmp (&codeword, &codeInfo->generator [index], sizeof (codeword)) == 0)
            return formatMessage ("rows %d and %d are duplicate", index + 1, codeInfo->codeParmK + 1);

      // add codeword to generator, then grow the allocation so that there is
      // always room for one more row
      codeInfo->generator [codeInfo->codeParmK++] = codeword;
      codeInfo->generator = realloca (codeInfo->generator, sizeof codeInfo->generator [0] * (codeInfo->codeParmK + 1), INTEGER_ALIGNMENT);
      if (!codeInfo->generator) return "memory allocation failed";

      // track the highest bit number to find N
      iniWordBits = highestSetBit (&codeword, MAXBITS) + 1;
      if (codeInfo->codeParmN < iniWordBits) codeInfo->codeParmN = iniWordBits;
      }

   return NULL;
   }

//----------------------------------------------------------------------------
//
// readIni - Read a binary linear code generator matrix from a file.
//           Use Magma or GAP format, or use raw data format where each
//           generator row is represented by a single base 2, 8 or 16 number,
//           one per line.
//
//           On success codeInfo holds the generator, K, N and the codeword
//           count 2^K, and NULL is returned. Otherwise an error message is
//           returned.
//
static char *readIni (CODEINFO *codeInfo, char *filename)
   {
   char *error, *buffer, *position;
   FILE *stream;
   size_t filelength, capacity;
   int readFailed;

   // read the ini file as binary into memory
   stream = fopen (filename, "rb");
   if (!stream) return formatMessage ("failed to open %s", filename);

   // The buffer starts non-empty so that an empty file still yields a valid,
   // nul terminated (empty) string.
   capacity = 4096;
   buffer = malloc (capacity);
   if (!buffer)
      {
      fclose (stream);
      return "memory allocation failed";
      }

   filelength = 0;
   for (;;)
      {
      char *larger;

      // fill the free space, always keeping one byte for the terminator
      filelength += fread (buffer + filelength, 1, capacity - 1 - filelength, stream);

      // a short read means end of file or a read error
      if (filelength < capacity - 1) break;

      // Grow geometrically. The result goes to a temporary so the original
      // buffer can still be released if the reallocation fails.
      larger = realloc (buffer, capacity * 2);
      if (!larger)
         {
         free (buffer);
         fclose (stream);
         return "memory allocation failed";
         }
      buffer = larger;
      capacity *= 2;
      }
   readFailed = ferror (stream);
   fclose (stream);
   if (readFailed)
      {
      free (buffer);
      return formatMessage ("failed to read %s", filename);
      }
   buffer [filelength] = '\0';

   position = findMagmaGapGenerator (buffer);
   if (position) error = readMagmaGapGenerator (codeInfo, position);
   else error = readRawGenerator (codeInfo, buffer);
   free (buffer);

   // The codeword count 2^K is kept in a 64-bit integer. Shifting by 64 or
   // more is undefined, so reject such generators instead.
   if (!error && codeInfo->codeParmK >= 64)
      error = formatMessage ("generator has %d rows, at most 63 are supported", codeInfo->codeParmK);
   if (!error) codeInfo->codewords = 1ull << codeInfo->codeParmK;
   return error;
   }

//----------------------------------------------------------------------------
//
// generatecodeword - Form a codeword by xoring together selected generator
//                    rows. Bit n of rowSelect selects generator row n. The
//                    result is cleared first, so a rowSelect of zero gives
//                    the all-zero codeword.
//
static void generatecodeword (INTEGER *generator, INTEGER *result, UINT64 rowSelect, int activeBits)
   {
   const INTEGER allZero = {{0}};
   int row;

   *result = allZero;

   // consume the selection mask from the low bit upwards, stopping as soon
   // as no selected rows remain
   for (row = 0; rowSelect != 0; row++, rowSelect >>= 1)
      {
      if (rowSelect & 1)
         xorInteger (result, &generator [row], result, activeBits);
      }
   }

//----------------------------------------------------------------------------
//
// Core function for tabulating the weight distribution
//
// A lookup table lets the inner loop generate codewords using a single XOR operation.
// For each generated codeword, a counter corresponding to weight of the codeword is incremented.
//
// The next level loop uses a second lookup table to generate additional codeword bits
// using a single XOR operation.
//
// The outer loop generates upper code word bits. A lookup table is not needed for the
// outer loop. The dataStart and dataStop values are given so that the work can be
// dispatched to several threads.
//
// Parameters:
//    generator    - generator rows, used for the upper (outer loop) bits
//    dataStart    - first row selection value to process
//    dataStop     - processing stops when the row selection value reaches this
//    subtotal     - weight counters, indexed by codeword weight; incremented,
//                   never cleared here
//    lookup0      - inner lookup table, lookupCount0 entries
//    lookup1      - middle lookup table, lookupCount1 entries
//    activeBits   - passed through to the extended integer functions
//
// Always returns NULL.
//
// Each outer iteration covers lookupCount0 * lookupCount1 codewords, so at
// least that many are processed even when dataStop - dataStart is smaller.
//
static inline char *weightDistributionCore (INTEGER *generator, uint64_t dataStart, uint64_t dataStop, uint64_t *subtotal, INTEGER *lookup0, INTEGER *lookup1, int lookupCount0, int lookupCount1, int activeBits)
   {
   uint64_t data;
   int index0, index1;

   // the outer loop handles generator rows not covered by the lookup tables
   for (data = dataStart; data < dataStop; data += lookupCount0 * lookupCount1)
      {
      INTEGER codewordUpper;
      
      // xor together generator rows corresponding to bits outside the lookup tables
      // NOTE(review): assumes the low bits of data (those covered by the lookup
      // tables) are zero, i.e. dataStart is a multiple of the step - confirm with callers
      generatecodeword (generator, &codewordUpper, data, activeBits);

      // combine this result with each lookup table entry
      for (index1 = 0; index1 < lookupCount1; index1++)
         {
         INTEGER codewordMid;
         xorInteger (&lookup1 [index1], &codewordUpper, &codewordMid, activeBits);

         // The lookup0 loop is manually unrolled for a significant performance gain.
         // The minimum value of iterations must be chosen accordingly.
         // The body below handles exactly eight entries per pass while the loop
         // steps by 1 << LOOKUP0_UNROLL, so the two only agree when
         // LOOKUP0_UNROLL is 3 - NOTE(review): confirm the definition.
         // lookupCount0 must be a multiple of eight, otherwise entries past the
         // end of lookup0 are read.
         // popCntIntegerPair presumably returns the bit count of the xor of its
         // two operands - verify against its definition.
         for (index0 = 0; index0 < lookupCount0; index0 += 1 << LOOKUP0_UNROLL)
            {
            int bits;
            bits = popCntIntegerPair (&lookup0 [index0 + 0], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 1], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 2], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 3], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 4], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 5], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 6], &codewordMid, activeBits); subtotal [bits]++;
            bits = popCntIntegerPair (&lookup0 [index0 + 7], &codewordMid, activeBits); subtotal [bits]++;
            }
         }
      }
   return NULL;
   }

//----------------------------------------------------------------------------
//
// Core function for tabulating the weight distribution (unoptimized version)
//
// This function works the same as the optimized version, except it does
// not support loop unrolling, multithreading, or lookup tables. It is used for
// codes too small to accommodate or benefit from these optimizations. Use of this
// function can also be forced by command line option: -unopt.
//
// All 2^K codewords are generated (K is codeInfo->codeParmK) and the entry of
// weightTable that corresponds to each codeword's weight is incremented.
// Always returns NULL.
//
static inline char *weightDistributionCoreUnopt (CODEINFO *codeInfo, uint64_t *weightTable, int activeBits)
   {
   uint64_t data, codewords;

   // All-zero second operand for the pair based bit count. It never changes,
   // so it is set up once rather than on every pass through the loop.
   INTEGER IntegerZero = {{0}};

   codewords = 1ull << codeInfo->codeParmK;
   for (data = 0; data < codewords; data++)
      {
      int bits;
      INTEGER codeword;

      // data doubles as the row selection mask for this codeword
      generatecodeword (codeInfo->generator, &codeword, data, activeBits);

      bits = popCntIntegerPair (&codeword, &IntegerZero, activeBits);
      weightTable [bits]++;
      }
   return NULL;
   }

//----------------------------------------------------------------------------
//
// Core function for multithreaded mode. Worker threads grab the next available
// code section for processing. Use of a locked add instruction allows the
// threads to get a unique section without the need for OS synchronization calls.
// Each thread updates a local set of weight counters, so that no locking or
// other MP synchronization is needed. When a thread completes a section, the
// shared weight table is updated using a locked add. A thread completion count
// is updated using locked add. When the final thread updates the completion
// counter, it signals the dispatching thread that all worker threads have completed.
//
//    subtotal     - this thread's private weight counters; must be zero on
//                   entry and are left zero after each section
//    threadStruct - state shared by all worker threads
//
//    Returns NULL, or an error message if signalling the dispatcher fails.
//
static char *workerThread (uint64_t *subtotal, THREAD_STRUCT *threadStruct)
   {
   CODEINFO *codeInfo = threadStruct->codeInfo;
   INTEGER  *generator = codeInfo->generator;
   INTEGER  *lookup0 = threadStruct->lookup0;
   INTEGER  *lookup1 = threadStruct->lookup1;
   uint64_t *weightTable = threadStruct->weightTable;
   uint64_t sectionSize = 1ull << threadStruct->sectionBits;
   uint64_t codewords = threadStruct->codewords;
   uint64_t dataStart, dataStop;
   int      count0 = threadStruct->lookupCount0;
   int      count1 = threadStruct->lookupCount1;
   int      activeBits = threadStruct->activeBits;
   int      weightCount = codeInfo->codeParmN + 1;

   // Read before the completion counter is touched: once this thread has
   // counted itself as complete, the dispatcher may release threadStruct at
   // any moment, so it must not be read again by a thread that is not the last.
   long     threadCount = threadStruct->threadCount;

   // Normally the compiler can recognize when activeBits is set to a constant value
   // and optimize accordingly. But in this case, the compiler has no way of tracking
   // the activeBits variable because it is passed through an OS Call. To work around
   // the significant performance loss for fixed integer sized (256 and 512 bit), we
   // can use the preprocessor definition directly.
   #if defined (FIXED_INTEGER_SIZE)
   activeBits = MAXBITS;
   #endif

   for (;;)
      {
      int weight;

      // Grab the next block of codewords for processing by this thread. The
      // claim is made with a single locked add and only then checked against
      // the end of the work. Testing the shared position first and adding
      // afterwards would let two threads both pass the test when one section
      // remains, and one of them would then process codewords beyond the end.
      dataStop = _InterlockedAdd64 ((volatile int64_t *) &threadStruct->next, sectionSize);
      dataStart = dataStop - sectionSize;

      // completion check: when no more sections are available, the thread returns
      if (dataStart >= codewords)
         {
         // Increment global completion counter. The value returned by the
         // locked increment is unique to this thread, so exactly one thread
         // sees the final count and signals the dispatcher.
         long completed = _InterlockedIncrement ((volatile long *) &threadStruct->completions);
         if (completed == threadCount)
            {
            BOOL success = SetEvent (threadStruct->allThreadsComplete);
            if (!success) return formatMessage ("SetEvent: %s", winErrorText (0));
            }
         return NULL;
         }

      weightDistributionCore (generator, dataStart, dataStop, subtotal, lookup0, lookup1, count0, count1, activeBits);

      // add per-thread subtotals to shared grand totals
      for (weight = 0; weight < weightCount; weight++)
         {
         _InterlockedAdd64 ((volatile int64_t *) &weightTable [weight], subtotal [weight]);
         subtotal [weight] = 0;
         }
      }
   }

//----------------------------------------------------------------------------
//
// Thread entry point. The thread's local weight counter array lives on this
// function's stack: it is aligned, cleared, and handed to the main thread
// processing function. Any error message that function returns is printed.
// The thread exit value is always 0.
//
static unsigned int __stdcall workerThreadEntry (void *arg)
   {
   #define  bufferSize (sizeof subtotal [0] * (MAXBITS + 1))
   uint64_t *subtotal;
   uint8_t  subtotalBuffer [bufferSize + INTEGER_ALIGNMENT];
   THREAD_STRUCT *shared = arg;
   char *failure;

   // the raw buffer carries INTEGER_ALIGNMENT spare bytes so that the
   // counters can start on an aligned address inside it
   subtotal = alignAddress (subtotalBuffer, INTEGER_ALIGNMENT);
   memset (subtotal, 0, bufferSize);

   failure = workerThread (subtotal, shared);
   if (failure != NULL) printf ("%s\n", failure);

   return 0;
   }

//----------------------------------------------------------------------------
//
// Main function for tabulating the weight distribution. This function builds
// the two stages of codeword lookup tables then runs the core processing
// function. For single threaded operation the core function is called
// directly so that the overhead of threaded mode is avoided. Otherwise worker
// threads are started and this function waits until the last one signals
// completion. If the option for unoptimized operation is selected, this
// function calls the unoptimized weight distribution function instead.
//
//    codeInfo    - code description; generator, lookup sizes, codeword count
//    threads     - number of worker threads, 1 for single threaded operation
//    weightTable - receives the weight counts, indexed by weight
//    activeBits  - passed through to the extended integer functions
//
//    Returns NULL on success or an error message.
//
static char *weightDistribution (CODEINFO *codeInfo, int threads, uint64_t *weightTable, int activeBits)
   {
   uint64_t codewords;
   int index, sectionBits, lookupCount0, lookupCount1;
   INTEGER *lookup0, *lookup1;
   THREAD_STRUCT *threadStruct;
   DWORD result;

   // number of codewords to evaluate: 2^K for full code, or reduced for quick benchmark
   codewords = codeInfo->codewords;

   // If K (generator row count) is small, do not use lookup tables or multithreading.
   // Use of this function can also be forced with command line option: -unopt
   if (codeInfo->unopt) return weightDistributionCoreUnopt (codeInfo, weightTable, activeBits);

   // Ensure the total lookup table size does not exceed the generator row
   // count. This is checked before the tables are built because building
   // them reads one generator row per lookup bit.
   if (codeInfo->lookupBits0 + codeInfo->lookupBits1 > codeInfo->codeParmK) return "lookup size > K";

   lookupCount0 = 1 << codeInfo->lookupBits0;
   lookupCount1 = 1 << codeInfo->lookupBits1;

   lookup0 = alignAddress (codewordLookup0, INTEGER_ALIGNMENT);
   lookup1 = alignAddress (codewordLookup1, INTEGER_ALIGNMENT);

   // Precompute a lookup table of the xor of a portion of the generator rows.
   // This reduces the average xor argument count to close to two, rather than K/2.
   // lookup0 covers the lowest lookupBits0 rows.
   for (index = 0; index < lookupCount0; index++)
      generatecodeword (codeInfo->generator, &lookup0 [index], index << 0, activeBits);

   // lookup1 covers the next lookupBits1 rows
   for (index = 0; index < lookupCount1; index++)
      generatecodeword (codeInfo->generator, &lookup1 [index], index << codeInfo->lookupBits0, activeBits);

   // for single thread operation, call the core function directly to avoid threading overhead
   if (threads == 1)
      return weightDistributionCore (codeInfo->generator, 0, codewords, weightTable, lookup0, lookup1, lookupCount0, lookupCount1, activeBits);

   // divide into 16 sections per thread for parallel processing
   sectionBits = highestSetBit64 (codewords / threads / 16);

   // ensure each section covers a complete loop through the lookup tables
   sectionBits = max (sectionBits, codeInfo->lookupBits0 + codeInfo->lookupBits1);

   // start threads for parallel processing
   threadStruct = calloc (1, sizeof *threadStruct);
   if (!threadStruct) return "memory allocation failed";
   threadStruct->codeInfo = codeInfo;
   threadStruct->sectionBits = sectionBits;
   threadStruct->weightTable = weightTable;
   threadStruct->codewords = codewords;
   threadStruct->lookup0 = lookup0;
   threadStruct->lookup1 = lookup1;
   threadStruct->lookupCount0 = lookupCount0;
   threadStruct->lookupCount1 = lookupCount1;
   threadStruct->activeBits = activeBits;
   threadStruct->threadCount = threads;
   threadStruct->allThreadsComplete = CreateEvent (NULL, FALSE, FALSE, NULL);
   if (!threadStruct->allThreadsComplete)
      {
      // no worker has been started yet, so the shared state can be released
      free (threadStruct);
      return formatMessage ("CreateEvent: %s", winErrorText (0));
      }

   // NOTE(review): the handles returned by _beginthreadex and CreateEvent are
   // never closed. This function runs once per benchmark pass, so handles
   // accumulate until the process exits - consider closing them.
   for (index = 0; index < threads; index++)
      {
      uintptr_t handle = _beginthreadex (NULL, 0, &workerThreadEntry, threadStruct, 0, NULL);

      // threadStruct is deliberately not released on this path: workers that
      // were already started may still be using it
      if (!handle) return formatMessage ("_beginthreadex: %s", winErrorText (0));
      }

   // Wait for completion. 0xFFFFFFFF is the failure return of the wait call.
   // As above, threadStruct is not released when the wait fails because the
   // workers may still be running.
   result = WaitForSingleObject (threadStruct->allThreadsComplete, INFINITE);
   if (result == 0xFFFFFFFF) return formatMessage ("WaitForSingleObject: %s", winErrorText (0));
   if (result != WAIT_OBJECT_0) return formatMessage ("WaitForSingleObject: %Xh", result);

   free (threadStruct);
   return NULL;
   }

//----------------------------------------------------------------------------
//
// This wrapper function times the total weight distribution calculation and
// handles the optional priority boost. The return value is the elapsed time
// in performance counter units. If the calculation reports an error, the
// message is printed and the program exits with status 1.
//
static uint64_t timeWeights (CODEINFO *codeInfo, int threads, int activeBits, uint64_t *weightTable)
   {
   char     *failure;
   DWORD    savedPriority;
   uint64_t begin, ticks;

   // remember the current priority class, then optionally raise it to
   // increase measurement accuracy
   savedPriority = GetPriorityClass (GetCurrentProcess ());
   if (codeInfo->priority) SetPriorityClass (GetCurrentProcess (), ABOVE_NORMAL_PRIORITY_CLASS);

   begin = queryPerformanceCounter ();
   failure = weightDistribution (codeInfo, threads, weightTable, activeBits);
   ticks = queryPerformanceCounter () - begin;

   // put the priority class back before reporting anything
   if (codeInfo->priority) SetPriorityClass (GetCurrentProcess (), savedPriority);

   if (failure)
      {
      printf ("%s\n", failure);
      exit (1);
      }
   return ticks;
   }

//----------------------------------------------------------------------------
//
// This wrapper function prints startup information, completion information,
// and the final weight table result. The minimum distance reported is the
// lowest non-zero weight that has at least one codeword.
//
// Returns NULL on success, or an error message if memory cannot be allocated
// or the number of codewords tabulated differs from codeInfo->codewords.
//
static char *runWeights (CODEINFO *codeInfo, int threads, int activeBits)
   {
   int      eccWeight, minimumDistance = 0;
   uint64_t *weightTable, elapsed, totalcodewords;

   weightTable = calloca (sizeof weightTable [0] * (codeInfo->codeParmN + 1), INTEGER_ALIGNMENT);
   if (!weightTable) return "memory allocation failed";

   printf ("using %s %s algorithm, %d threads", PRECISION_TEXT, FUNCTION_TEXT, threads);
   if (codeInfo->lookupBits0 || codeInfo->lookupBits1) printf (", %d:%d lookup", codeInfo->lookupBits1, codeInfo->lookupBits0);
   printf ("\n");
   printf ("[%d, %d, ?]\n", codeInfo->codeParmN, codeInfo->codeParmK);

   elapsed = timeWeights (codeInfo, threads, activeBits, weightTable);

   // print every weight that occurs and total the codewords as a cross check
   totalcodewords = 0;
   for (eccWeight = 0; eccWeight <= codeInfo->codeParmN; eccWeight++)
      {
      uint64_t entry = weightTable [eccWeight];
      if (entry == 0) continue;
      if (eccWeight && !minimumDistance) minimumDistance = eccWeight;
      totalcodewords += entry;

      // the cast keeps the argument in step with %llu on platforms where
      // uint64_t is not unsigned long long
      printf ("%-7d %llu\n", eccWeight, (unsigned long long) entry);
      }

   // the table is no longer needed on either of the paths below
   freea (weightTable);

   if (codeInfo->codewords != totalcodewords)
      return formatMessage ("----- expected %llu code words, but processed %llu -----\n", (unsigned long long) codeInfo->codewords, (unsigned long long) totalcodewords);

   printf ("[%d, %d, %d]\n", codeInfo->codeParmN, codeInfo->codeParmK, minimumDistance);
   printf ("elapsed time %.4G seconds\n", (double) elapsed / queryPerformanceFrequency ());
   return NULL;
   }

//----------------------------------------------------------------------------
//
// Time the weight distribution function without printing anything. This is
// used at program startup to automatically select suitable lookup table sizes
// based on benchmark results. The weight counts go to a temporary table that
// is discarded. Only the elapsed time is passed back, through *elapsed.
// Returns NULL on success or an error message if the table cannot be
// allocated.
//
static char *timeWeightsSilent (CODEINFO *codeInfo, int threads, int activeBits, uint64_t *elapsed)
   {
   uint64_t *scratchTable = calloca (sizeof scratchTable [0] * (codeInfo->codeParmN + 1), INTEGER_ALIGNMENT);

   if (scratchTable == NULL) return "memory allocation failed";

   *elapsed = timeWeights (codeInfo, threads, activeBits, scratchTable);
   freea (scratchTable);
   return NULL;
   }

//----------------------------------------------------------------------------
//
// Tune lookup table sizes for best performance. A subset of codewords is
// processed using each of several preset lookup table sizes, and the final
// lookup table size is chosen based on best performance.
//
// On return codeInfo->lookupBits0 and codeInfo->lookupBits1 hold the chosen
// sizes, or zero when the generator has fewer rows than the smallest preset.
// Returns NULL on success or an error message.
//
static char *tuneWeights (CODEINFO *codeInfo, int threads, int activeBits)
   {
   static LOOKUP_TUNE lookupTune [] = {{4, 11}, {5, 9}, {5, 10}, {6, 9}, {7, 8}, {8, 7}, {9, 6}, {10, 6}, {11, 6}};
   CODEINFO trial;
   char *failure;
   uint64_t ticks = 0, fastest;
   int candidate, winner = 0, smallestTotal, benchRows, milliseconds;

   codeInfo->lookupBits0 = 0;
   codeInfo->lookupBits1 = 0;

   // smallest combined lookup size offered by any preset
   smallestTotal = INT_MAX;
   for (candidate = 0; candidate < DIMENSION (lookupTune); candidate++)
      {
      int total = lookupTune [candidate].lookupBits0 + lookupTune [candidate].lookupBits1;
      if (total < smallestTotal) smallestTotal = total;
      }

   if (smallestTotal > SMALL_CODE_K) return "no lookupTune entries small enough small for SMALL_CODE_K";
   if (codeInfo->codeParmK < smallestTotal)
      {
      if (codeInfo->verbose) printf ("Generator row count is small, not using lookup tables\n");
      return NULL;
      }

   // the benchmark runs on a copy of the code description so that the
   // codeword count can be cut down without disturbing the real one
   trial = *codeInfo;

   // Calibration: starting from a small codeword count, keep doubling it
   // until one timed run takes more than 50 ms or all rows are in use.
   trial.lookupBits0 = min (6, codeInfo->codeParmK / 2);
   trial.lookupBits1 = min (6, codeInfo->codeParmK / 2);
   benchRows = min (codeInfo->codeParmK, 10);
   trial.codewords = 1ull << benchRows;
   for (benchRows++; benchRows < codeInfo->codeParmK; benchRows++)
      {
      failure = timeWeightsSilent (&trial, threads, activeBits, &ticks);
      if (failure) return failure;
      milliseconds = ticks * 1000 / queryPerformanceFrequency ();
      if (milliseconds > 50) break;
      trial.codewords = 1ull << benchRows;
      }
   if (codeInfo->verbose) printf ("using %d rows for benchmark\n", benchRows);
   trial.codewords = 1ull << benchRows;

   // time each preset that fits within the generator row count and keep the
   // fastest one (the later preset wins a tie)
   fastest = 0xFFFFFFFFFFFFFFFF;
   for (candidate = 0; candidate < DIMENSION (lookupTune); candidate++)
      {
      trial.lookupBits0 = lookupTune [candidate].lookupBits0;
      trial.lookupBits1 = lookupTune [candidate].lookupBits1;
      if (trial.lookupBits0 + trial.lookupBits1 <= trial.codeParmK)
         {
         failure = timeWeightsSilent (&trial, threads, activeBits, &ticks);
         if (failure) return failure;
         if (codeInfo->verbose) printf ("%d: [%2d, %2d] %.4G\n", candidate + 1, trial.lookupBits1, trial.lookupBits0, (double) ticks * 1000 / queryPerformanceFrequency ());
         if (ticks <= fastest)
            {
            fastest = ticks;
            winner = candidate;
            }
         }
      }

   codeInfo->lookupBits0 = lookupTune [winner].lookupBits0;
   codeInfo->lookupBits1 = lookupTune [winner].lookupBits1;
   if (codeInfo->verbose) printf ("lookup size determined from benchmark run %d\n", winner + 1);
   return NULL;
   }

//----------------------------------------------------------------------------
//
// Main program entry. This function processes command line arguments and then
// reads the file containing the generator matrix in order to find the number
// of bits in each codeword. Based on the number of bits and command line
// options, a particular processing function is called. One set of processing
// functions is dedicated to codes of 256 and fewer bits. Another handles codes
// of 512 of fewer bits. A third handles all larger codes (up to 65536 bits).
// Using 3 sets of functions lets the program benefit from optimizations for
// fixed sized operations without sacrificing the ability to process larger
// codewords.
// 
char *mainProg (char* (*main256)(), char* (*main512)(), int argc, char *argv [])
   {
   char     *error;
   int      verbose = 0, lookupKnown = 0, useFixedFunctions = 1, threads = 0, priority = 0, unopt = 0, weights = 1;
   int      activeBits, argCount;
   char     *fileName = NULL;
   CODEINFO *codeInfo;

   codeInfo = calloca (sizeof (CODEINFO), INTEGER_ALIGNMENT);
   if (!codeInfo) return "memory allocation failed";

   if (argc == 1) return helpScreen ();

   argCount = argc;
   while (--argCount)
      {
      char *position = argv [argCount];
      
      // skip args processed by the calling function
      if (*position == '\0') continue;

      if (*position == '-')
         {
         if (strcmp (position, "-verbose") == 0)
            verbose = 1;
         else if (memcmp (position, "-lookup=", 8) == 0)
            {
            char *comma = strchr (position, ':');
            if (!comma) return "missing colon";
            codeInfo->lookupBits1 = strtoul (position + 8, NULL, 10);
            codeInfo->lookupBits0 = strtoul (comma + 1, NULL, 10);
            if (codeInfo->lookupBits0 < LOOKUP0_UNROLL) return formatMessage ("Lower lookup minimum is %d due to loop unrolling", LOOKUP0_UNROLL);
            lookupKnown = 1;
            }
         else if (memcmp (position, "-threads=", 9) == 0)
            threads = strtoul (position + 9, NULL, 10);
         else if (strcmp (position, "-nofix") == 0)
            useFixedFunctions = 0;
         else if (strcmp (position, "-priority") == 0)
            priority = 1;
         else if (strcmp (position, "-unopt") == 0)
            unopt = 1;
         else if (strcmp (position, "-weights") == 0)
            weights = 1;
         else return formatMessage ("unexpected argument: %s", position);
         }
      else // option doesn't start with '-', must be file name
         {
         if (fileName) return "only one file name allowed";
         fileName = position;
         error = readIni (codeInfo, fileName);
         if (error) return error;
         }
      }

   #if defined (FIXED_INTEGER_SIZE)
   activeBits = MAXBITS;
   #else
   activeBits = roundUp (codeInfo->codeParmN, ELEMENT_BITS);
   #endif

   if (!fileName) return "generator file required";
   if (!weights) return "no operation selected on the command line";

   if (threads == 0) threads = atoi (getenv ("NUMBER_OF_PROCESSORS"));

   // if called from main.c, see if fixed precision code should be used
   if (main256 || main512)
      {
      if (useFixedFunctions)
         {
         if (activeBits <= 256)
            {
            freea (codeInfo->generator);
            freea (codeInfo);
            return main256 (NULL, NULL, argc, argv);
            }

         else if (activeBits <= 512)
            {
            freea (codeInfo->generator);
            freea (codeInfo);
            return main512 (NULL, NULL, argc, argv);
            }
         }
      }

   codeInfo->priority = priority;
   codeInfo->verbose = verbose;
   codeInfo->unopt = unopt;

   // if K (generator row count) is small, do not use lookup tables, unroll, or threads
   // can also be forced using command line option: -unopt
   if (codeInfo->codeParmK <= SMALL_CODE_K) codeInfo->unopt = 1;
   if (codeInfo->unopt)
      {
      threads = 1;
      codeInfo->lookupBits0 = 0;
      codeInfo->lookupBits1 = 0;
      lookupKnown = 1;
      }

   // if look table sizes not entered on command line, choose using a quick benchmark
   if (lookupKnown == 0)
      {
      error = tuneWeights (codeInfo, threads, activeBits);
      if (error) return error;
      }

   codeInfo->lookupBits0 = min (codeInfo->lookupBits0, codeInfo->codeParmK);
   codeInfo->lookupBits1 = min (codeInfo->lookupBits1, codeInfo->codeParmK - codeInfo->lookupBits0);
   if (1ull << (codeInfo->lookupBits0) > MAX_CODEWORD_LOOKUP0) return "codeword lookup table size too big";
   if (1ull << (codeInfo->lookupBits1) > MAX_CODEWORD_LOOKUP0) return "codeword lookup table size too big";

   error = runWeights (codeInfo, threads, activeBits);
   if (error) return error;

   freea (codeInfo->generator);
   freea (codeInfo);
   return NULL;
   }

//----------------------------------------------------------------------------
