//----------------------------------------------------------------------------
//
// blcutil - a binary linear code utility
//           copyright 2013 Scott Duplichan
//           This program is free software: you can redistribute it and/or modify
//           it under the terms of the GNU General Public License as published by
//           the Free Software Foundation, either version 3 of the License, or
//           (at your option) any later version.
//
//----------------------------------------------------------------------------

// this definition is needed to avoid performance problems with the gcc build on Win7
#define __USE_MINGW_ANSI_STDIO 1

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <windows.h>
#include <process.h>
#include <intrin.h>

// The Microsoft compiler (VS2012) does not recognize the 'inline' keyword,
// so substitute its own __forceinline spelling when building with it.
#ifdef _MSC_VER
#define inline __forceinline
#endif

//----------------------------------------------------------------------------
// Main entry point naming. The entry point name carries the MAXBITS value
// (when it is 256 or 512) followed by a variant suffix, giving names such as
// mainprog256_gpr, mainprog512_gpr, mainprog_gpr, mainprog256_avx, ...
//
// PP_PASTE       - joins four preprocessor tokens into one identifier
// MAIN_PROG_NAME - builds the entry point name for variant suffix x; the
//                  width field is left empty when MAXBITS is neither 256
//                  nor 512
#define PP_PASTE(w, x, y, z) w ## x ## y ## z
#if MAXBITS == 256
#define MAIN_PROG_NAME(x) PP_PASTE (mainprog, 256, _, x)
#elif MAXBITS == 512
#define MAIN_PROG_NAME(x) PP_PASTE (mainprog, 512, _, x)
#else
#define MAIN_PROG_NAME(x) PP_PASTE (mainprog,    , _, x)
#endif

//----------------------------------------------------------------------------
// utility preprocessor definitions
//
// DIMENSION - number of elements in an array object. The argument must be a
//             true array: sizeof applied to a pointer (including an array
//             function parameter) does not give the array size.
//             The argument is parenthesized at each use so that an
//             expression argument is grouped as the caller wrote it.
#define DIMENSION(array) (sizeof (array) / sizeof ((array) [0]))
//----------------------------------------------------------------------------

// threshold for skipping multithreading, unrolling, and lookup table optimizations
// NOTE(review): presumably compared against the code's K parameter - confirm at the use sites
#define SMALL_CODE_K 24

// ELEMENT_BITS - extended integer bit length must be a multiple of this value
//                loops can be optimized to process this size data on each pass
#define ELEMENT_BITS 256

// INTEGER_ALIGNMENT - alignment for allocated extended integer arrays
// NOTE(review): the unit (bytes or bits) is not stated in this header - confirm
//               against the allocation calls that use it
#define INTEGER_ALIGNMENT 256

// maximum codeword count for the two precomputed lookup tables
// (1 << 16 = 65536 codewords each)
#define MAX_CODEWORD_LOOKUP0 (1 << 16)
#define MAX_CODEWORD_LOOKUP1 (1 << 16)

// buffer for codeword lookup tables
// Declared here as incomplete byte arrays; the definitions (and therefore the
// actual sizes) live in another source file.
extern uint8_t codewordLookup0 [];
extern uint8_t codewordLookup1 [];

// Unroll factor for lookup0 loop. This #define is for helping the code
// ensure a compatible loop count is used.
#define LOOKUP0_UNROLL 3

//----------------------------------------------------------------------------

// native large integer that works most efficiently for this system
typedef uint64_t uintn_t;

// bit widths of the element types an extended integer can be viewed as
#define UINTN_BITS    64
#define UINT8_BITS     8
#define UINT32_BITS   32
#define UINT64_BITS   64
#define UINT128_BITS 128
#define UINT256_BITS 256

// number of elements of each width in a MAXBITS-bit extended integer
// MAXBITS is supplied by the build. It is parenthesized here so that a value
// given as an expression (for example -DMAXBITS=256+256) is divided as a
// whole instead of having only its last term divided.
#define UINTN_COUNT   ((MAXBITS) / UINTN_BITS)
#define UINT8_COUNT   ((MAXBITS) / UINT8_BITS)
#define UINT32_COUNT  ((MAXBITS) / UINT32_BITS)
#define UINT64_COUNT  ((MAXBITS) / UINT64_BITS)
#define UINT128_COUNT ((MAXBITS) / UINT128_BITS)
#define UINT256_COUNT ((MAXBITS) / UINT256_BITS)

// Most weight distribution runs are on binary codes of length 256 or less.
// For those, a separate execution path with a fixed 256 bit data size is
// generated: it avoids the overhead of carrying the active bit count
// through the calculation. The same is done for a fixed 512 bit data size.
// FIXED_INTEGER_SIZE selects that path when the extended integer holds
// exactly one or two 256-bit elements.
#if (UINT256_COUNT >= 1) && (UINT256_COUNT <= 2)
#define FIXED_INTEGER_SIZE
#endif

//----------------------------------------------------------------------------
//
// text strings for printing algorithm information
//
// PRECISION_TEXT - string describing the integer precision selected by
//                  MAXBITS. The whole expansion is parenthesized so that it
//                  behaves as a single operand; without the parentheses an
//                  adjacent operator would bind to only part of the
//                  conditional expression.
// FUNCTION_TEXT  - string naming the build variant. It is left undefined
//                  when none of the *_BUILD macros below is defined.
//
#define PRECISION_TEXT (MAXBITS==256? "256-bit fixed precision" \
                       :MAXBITS==512? "512-bit fixed precision" \
                       :              "variable precision")
#if defined (GPR_BUILD)
#define FUNCTION_TEXT "gpr"
#elif defined (XMMPOP_BUILD)
#define FUNCTION_TEXT "xmm popcnt"
#elif defined (YMMPOP_BUILD)
#define FUNCTION_TEXT "ymm popcnt"
#elif defined (SSE_BUILD)
#define FUNCTION_TEXT "xmm sse simd"
#elif defined (AVX_BUILD)
#define FUNCTION_TEXT "xmm avx simd"
#elif defined (AVX2_BUILD)
#define FUNCTION_TEXT "ymm avx2 simd"
#endif

//----------------------------------------------------------------------------
//
// extended integer structure
//
// A union that overlays one MAXBITS-bit value with arrays of different
// element widths. Each member's element count is MAXBITS divided by that
// element's bit width (see the *_COUNT definitions), so every member spans
// the same number of bits.
//
typedef union
   {
   uintn_t    uintn   [UINTN_COUNT];   // native-width elements
   uint8_t    uint8   [UINT8_COUNT];   // byte elements
   uint32_t   uint32  [UINT32_COUNT];  // 32-bit elements
   uint64_t   uint64  [UINT64_COUNT];  // 64-bit elements
   __m128i    m128i   [UINT128_COUNT]; // XMM data, x86 only
   // the 256-bit views exist only in the builds that use YMM registers
   #if defined (YMMPOP_BUILD) || defined (AVX2_BUILD)
   __m256d    m256d   [UINT256_COUNT]; // YMM data, x86 only
   __m256i    m256i   [UINT256_COUNT]; // YMM data, x86 only
   #endif
   }
INTEGER;

//----------------------------------------------------------------------------
//
// CODEINFO - binary linear code generator and misc info
//            codes are in non-systematic form
//
// NOTE(review): the comments on codeParmN and codewords below do not agree
//               with THREAD_STRUCT, which describes codewords as 2^K and
//               activeBits as N rounded up to a multiple of 256. Confirm
//               whether N is the code length and whether the codeword count
//               is (1 << codeParmK).
//
typedef struct
   {
   INTEGER  *generator;                // generator matrix array (K entries)
   int      codeParmK;                 // number of data (information) bits
   int      codeParmN;                 // number of error detection bits (see NOTE above)
   int      lookupBits0, lookupBits1;  // number of bits handled by lookup tables
   int      priority;                  // run at higher priority
   int      verbose;                   // enable extra debug messages
   int      unopt;                     // force unoptimized algorithm (single thread, no lookup, no unroll)
   uint64_t codewords;                 // normally (1 << codeParmN), reduced for quick benchmark (see NOTE above)
   }
CODEINFO;

//---------------------------------------------------------------------------
//
// structure for predefined codeword lookup table sizes
//
// Each entry pairs the bit counts for the two lookup tables; the field names
// match lookupBits0 / lookupBits1 in CODEINFO.
//
typedef struct
   {
   int lookupBits0;  // number of least significant data bits handled by lookup
   int lookupBits1;  // number of mid data bits handled by lookup
   }
LOOKUP_TUNE;

//----------------------------------------------------------------------------
//
// structure passed to a dispatched thread
//
// NOTE(review): per the field comments, weightTable, next and completions are
//               updated by the worker threads. How those updates are
//               synchronized is not visible in this header - confirm in the
//               thread code.
//
typedef struct
   {
   CODEINFO *codeInfo;        // pointer to codeInfo struct
   HANDLE allThreadsComplete; // event handle to tell main that all threads have completed
   int sectionBits;           // code is broken into sections of this size for thread processing
   uint64_t *weightTable;     // consolidated result updated by threads
   uint64_t next;             // data bit pattern for next section to process
   uint64_t codewords;        // total number of codewords (2^K)
   INTEGER *lookup0;          // codeword lookup table for lower data bits
   INTEGER *lookup1;          // codeword lookup table for mid data bits
   int lookupCount0;          // number of codewords in lower lookup table
   int lookupCount1;          // number of codewords in mid lookup table
   int activeBits;            // bits used in calculation (N rounded up to multiple of 256)
   int completions;           // thread completion count, for determining overall completion
   int threadCount;           // thread completion target value
   }
THREAD_STRUCT;

//----------------------------------------------------------------------------
// various ways to extract xmm data to a gpr register
//
// Each macro yields the 64-bit half of 'xmm' selected by 'immediate':
// 0 selects the low half; otherwise the selected half is first moved to the
// low position and then converted with _mm_cvtsi128_si64.
//
// 'immediate' is used as the immediate operand of a shift/shuffle intrinsic
// and so must be a compile-time constant. Both macro arguments appear more
// than once in the expansion, so pass expressions without side effects.
//
// The expansions and their arguments are fully parenthesized so that each
// macro acts as a single operand inside a larger expression.
//----------------------------------------------------------------------------

// custom1: byte-shift the register right by 8 * immediate bytes
#define _mm_extract_epi64_custom1(xmm, immediate)            \
((immediate) == 0 ?                                          \
_mm_cvtsi128_si64 (xmm)                                      \
:                                                            \
_mm_cvtsi128_si64 (_mm_srli_si128 ((xmm), 8 * (immediate))))

//----------------------------------------------------------------------------

// custom2: shuffle 32-bit elements (2 * immediate) and (2 * immediate + 1)
//          into the low 64 bits
#define _mm_extract_epi64_custom2(xmm, immediate)                \
((immediate) == 0 ?                                              \
_mm_cvtsi128_si64 (xmm)                                          \
:                                                                \
_mm_cvtsi128_si64 (_mm_shuffle_epi32 ((xmm),                     \
   _MM_SHUFFLE ((immediate) * 2 + 1, (immediate) * 2 + 0,        \
                (immediate) * 2 + 1, (immediate) * 2 + 0))))

//----------------------------------------------------------------------------
// the best choice here varies with compiler, calling code, and processor

//#define _mm_extract_epi64_custom(xmm, immediate) _mm_extract_epi64         (xmm, immediate)
//#define _mm_extract_epi64_custom(xmm, immediate) _mm_extract_epi64_custom1 (xmm, immediate)
#define _mm_extract_epi64_custom(xmm, immediate) _mm_extract_epi64_custom2 (xmm, immediate)

//---------------------------------------------------------------------------
// function prototypes
//
// Only the declarations are visible in this header. The notes marked
// NOTE(review) are inferred from names and must be confirmed against the
// definitions in the named source files.
//---------------------------------------------------------------------------
// utility.c
// NOTE(review): formatMessage takes a format string plus variable arguments
//               and returns char *; ownership of the returned text is not
//               shown here - confirm in utility.c
char *formatMessage (char *szFmt,...);
uint64_t queryPerformanceCounter (void);
uint64_t queryPerformanceFrequency (void);
// NOTE(review): calloca / freea / realloca look like an aligned
//               allocate / free / reallocate family (size and alignment
//               arguments) - confirm units and pairing rules in utility.c
void *calloca (size_t size, size_t align);
void freea (void *buffer);
void *realloca (void *buffer, size_t size, size_t align);
int roundUp (int value, int n);
void *alignAddress (void *address, int align);
char *skipWhiteSpace (char *position);
char *winErrorText (int code);
size_t removeTrailingCrLf (char *buffer, size_t length);

// misc_gpr.c
// bit-level helpers operating on the INTEGER extended integer type
int extractBit (INTEGER *data, int bitNumber);
void setBit (INTEGER *data, int bitnumber);
int highestSetBit64 (uint64_t data);
int highestSetBit (INTEGER *data, int activeBits);

// weightdistribution.c
// Main entry points, one per combination of MAXBITS naming (none, 256, 512)
// and build variant (gpr, xmmpop, ymmpop, sse, avx, avx2); these are the
// names produced by MAIN_PROG_NAME. All share one signature: two function
// pointers declared with unspecified parameter lists, followed by argc/argv.
// NOTE(review): main256 / main512 are presumably the fixed 256-bit and
//               512-bit entry points to dispatch to - confirm in
//               weightdistribution.c
char *mainprog_gpr        (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog_xmmpop     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog_ymmpop     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog_sse        (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog_avx        (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog_avx2       (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_gpr     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_xmmpop  (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_ymmpop  (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_sse     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_avx     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog256_avx2    (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_gpr     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_xmmpop  (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_ymmpop  (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_sse     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_avx     (char* (*main256)(), char* (*main512)(), int argc, char *argv []);
char *mainprog512_avx2    (char* (*main256)(), char* (*main512)(), int argc, char *argv []);

// main.c
char *helpScreen (void);

//----------------------------------------------------------------------------
