# BitNet/utils/test_gemm_kernel.sh

#!/bin/bash
# Unified GEMM kernel benchmark script
# Builds, tests, and benchmarks the GEMM kernel with configurable output

# Abort as soon as any unguarded command fails
set -o errexit

# Absolute directory holding this script; the build and stats paths below hang off it
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Default values (OUTPUT_CSV, ITERATIONS and SKIP_BUILD can be overridden by options)
SKIP_BUILD=false
OUTPUT_CSV=""
ITERATIONS=1000
BUILD_DIR="../build"

# Print usage
#
# Writes the help text (option list plus example invocations) to stdout.
# "$0" is used as the command name, so the text mirrors how the script was invoked.
print_usage() {
    local invoked_as="$0"

    printf '%s\n' \
        "Usage: ${invoked_as} [options]" \
        "" \
        "Options:" \
        "  -o, --output <path>     Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv)" \
        "  -i, --iterations <num>  Number of iterations per test (default: 1000)" \
        "  -s, --skip-build        Skip building the benchmark binary" \
        "  -h, --help              Show this help message" \
        "" \
        "Examples:" \
        "  # Run with default settings" \
        "  ${invoked_as}" \
        "" \
        "  # Specify custom output file" \
        "  ${invoked_as} -o /path/to/my_results.csv" \
        "" \
        "  # Quick test with fewer iterations" \
        "  ${invoked_as} -i 100 -o quick_test.csv" \
        "" \
        "  # Skip build if already compiled" \
        "  ${invoked_as} -s -o results.csv"
}

# Parse command line arguments
#
# parse_args: consume the script's options and update the global settings.
# Globals written: OUTPUT_CSV, ITERATIONS, SKIP_BUILD
# Arguments:       the script's command line ("$@")
# Exits:           0 after -h/--help; 1 on an unknown option, a missing option
#                  value, or an iteration count that is not a positive integer
parse_args() {
    local positive_int='^0*[1-9][0-9]*$'

    while [[ $# -gt 0 ]]; do
        case "$1" in
            -o|--output)
                # Without this check `shift 2` fails and errexit ends the
                # script without any message.
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a file path argument" >&2
                    print_usage >&2
                    exit 1
                fi
                OUTPUT_CSV="$2"
                shift 2
                ;;
            -i|--iterations)
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a numeric argument" >&2
                    print_usage >&2
                    exit 1
                fi
                # The value is forwarded to the benchmark binary as -i; reject
                # anything that is not a positive integer up front instead of
                # letting every single benchmark run fail on it.
                if ! [[ "$2" =~ $positive_int ]]; then
                    echo "Error: iterations must be a positive integer, got '$2'" >&2
                    exit 1
                fi
                ITERATIONS="$2"
                shift 2
                ;;
            -s|--skip-build)
                SKIP_BUILD=true
                shift
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                print_usage >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Fall back to the default CSV location (relative to the script) when no
# output path was supplied or it is empty
: "${OUTPUT_CSV:=${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv}"

# The target directory must exist before it can be resolved with cd/pwd below
mkdir -p "$(dirname "$OUTPUT_CSV")"

# Turn a relative path into an absolute one; absolute paths are left untouched
case "$OUTPUT_CSV" in
    /*)
        ;;
    *)
        OUTPUT_CSV="$(cd "$(dirname "$OUTPUT_CSV")" && pwd)/$(basename "$OUTPUT_CSV")"
        ;;
esac

# Show the effective configuration before any work starts
printf '%s\n' \
    "==========================================" \
    "GEMM Kernel Benchmark Suite" \
    "==========================================" \
    "Configuration:" \
    "  Iterations: $ITERATIONS" \
    "  Output CSV: $OUTPUT_CSV" \
    "  Skip build: $SKIP_BUILD" \
    "==========================================" \
    ""

# Build the benchmark binary
#
# Unless --skip-build was given, the embedded C++ source below is written to a
# temporary file in the build directory, compiled against the pre-built
# libggml.so and removed again. With --skip-build the existing binary is
# reused. Either way BENCHMARK_BIN ends up pointing at the binary to run.
if [ "$SKIP_BUILD" = false ]; then
    echo "Step 1: Building GEMM kernel benchmark..."
    echo "------------------------------------------"

    CXX=${CXX:-g++}

    # Create build directory if it doesn't exist
    mkdir -p "${SCRIPT_DIR}/${BUILD_DIR}"

    # Create temporary C++ source file
    TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp"

    # The delimiter is quoted so the shell copies the C++ text verbatim
    cat > "${TEMP_CPP}" << 'EOF'
/**
 * Standalone benchmark for ggml_gemm_i2_i8_s kernel
 *
 * This program tests the performance of the ggml_gemm_i2_i8_s kernel
 * with configurable matrix sizes and iteration counts.
 *
 * Usage: ./test_gemm_kernel [options]
 *   -n <size>   : embedding dimension (must be divisible by 4, default: 2048)
 *   -r <rows>   : number of rows in matrix Y (default: 32)
 *   -c <cols>   : number of columns in matrix X (default: 128)
 *   -i <iters>  : number of iterations (default: 1000)
 *   -w <warmup> : number of warmup iterations (default: 10)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdint.h>
#include <math.h>
#include <assert.h>

// Include necessary headers
#include "../include/gemm-config.h"

// Function declarations (from ggml-quants.h)
extern "C" void ggml_vec_dot_i2_i8_s(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc);

// GEMM kernel definition
void ggml_gemm_i2_i8_s(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
#if defined(ACT_PARALLEL)
    const int64_t row_block = ROW_BLOCK_SIZE;
    const int64_t col_block = COL_BLOCK_SIZE;

    for (int64_t c0 = 0; c0 < nc; c0 += col_block) {
        int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0);
        for (int64_t r0 = 0; r0 < nr; r0 += row_block) {
            int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0);
            const void * vy_r = (const uint8_t *)vy + r0 * n;
            for (int64_t c = 0; c < cur_c; ++c) {
                const int64_t col = c0 + c;
                float * s_col = s + col;
                const void * vx_col = (const uint8_t *)vx + col * n / 4;
                ggml_vec_dot_i2_i8_s(n, s_col + r0 * bs, bs, vx_col, n, vy_r, n, cur_r);
            }
        }
    }
#else
    const int64_t row_block = ROW_BLOCK_SIZE;
    const int64_t col_block = COL_BLOCK_SIZE;

    for (int64_t r0 = 0; r0 < nr; r0 += row_block) {
        int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0);
        for (int64_t c0 = 0; c0 < nc; c0 += col_block) {
            int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0);
            const void * vx_c = (const uint8_t *)vx + c0 * n / 4;
            for (int64_t r = 0; r < cur_r; ++r) {
                const int64_t row = r0 + r;
                float * s_row = s + row * bs;
                const void * vy_row = (const uint8_t *)vy + row * n;
                ggml_vec_dot_i2_i8_s(n, s_row + c0, bs, vx_c, n, vy_row, n, cur_c);
            }
        }
    }
#endif
}

// Helper function to get current time in nanoseconds
double get_time_ns() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1e9 + ts.tv_nsec;
}

// Initialize matrix with random i2 values (2-bit quantized)
void init_matrix_i2(uint8_t* data, int n, int cols) {
    // i2 format: 4 values per byte (2 bits each)
    int total_bytes = n * cols / 4;
    for (int i = 0; i < total_bytes; i++) {
        data[i] = rand() & 0xFF;
    }
}

// Initialize matrix with random i8 values
void init_matrix_i8(int8_t* data, int n, int rows) {
    int total_elements = n * rows;
    for (int i = 0; i < total_elements; i++) {
        data[i] = (int8_t)((rand() % 256) - 128);
    }
}

// Benchmark configuration
struct BenchmarkConfig {
    int n;           // embedding dimension (must be divisible by 4)
    int nr;          // number of rows in Y matrix
    int nc;          // number of columns in X matrix
    int iterations;  // number of benchmark iterations
    int warmup;      // number of warmup iterations
};

void print_config(const BenchmarkConfig& config) {
    printf("=" "=%.78s\n", "===============================================================================");
    printf("Benchmark Configuration:\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf("  Embedding dimension (n)    : %d\n", config.n);
    printf("  Matrix Y rows (nr)         : %d\n", config.nr);
    printf("  Matrix X columns (nc)      : %d\n", config.nc);
    printf("  Iterations                 : %d\n", config.iterations);
    printf("  Warmup iterations          : %d\n", config.warmup);
    printf("\nMatrix sizes:\n");
    printf("  X (i2): %d x %d (%.2f KB)\n", config.nc, config.n,
           (config.nc * config.n / 4) / 1024.0);
    printf("  Y (i8): %d x %d (%.2f KB)\n", config.nr, config.n,
           (config.nr * config.n) / 1024.0);
    printf("  S (f32): %d x %d (%.2f KB)\n", config.nr, config.nc,
           (config.nr * config.nc * sizeof(float)) / 1024.0);
    printf("\nGEMM Config:\n");
#if defined(ACT_PARALLEL)
    printf("  ACT_PARALLEL              : ON\n");
#else
    printf("  ACT_PARALLEL              : OFF\n");
#endif
    printf("  ROW_BLOCK_SIZE            : %d\n", ROW_BLOCK_SIZE);
    printf("  COL_BLOCK_SIZE            : %d\n", COL_BLOCK_SIZE);
    printf("  PARALLEL_SIZE             : %d\n", PARALLEL_SIZE);
    printf("=" "=%.78s\n\n", "===============================================================================");
}

void run_benchmark(const BenchmarkConfig& config) {
    // Allocate matrices
    printf("Allocating matrices...\n");

    // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes
    // Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes)
    size_t x_size = config.nc * config.n / 4;
    size_t x_size_aligned = ((x_size + 63) / 64) * 64;
    uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned);

    // Y matrix (i8 format): nr x n
    size_t y_size = config.nr * config.n;
    size_t y_size_aligned = ((y_size + 63) / 64) * 64;
    int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned);

    // Result matrix (float32): nr x nc
    size_t s_size = config.nr * config.nc * sizeof(float);
    size_t s_size_aligned = ((s_size + 63) / 64) * 64;
    float* S = (float*)aligned_alloc(64, s_size_aligned);

    if (!X || !Y || !S) {
        fprintf(stderr, "Failed to allocate memory\n");
        exit(1);
    }

    // Initialize matrices with random data
    printf("Initializing matrices with random data...\n");
    srand(time(NULL));
    init_matrix_i2(X, config.n, config.nc);
    init_matrix_i8(Y, config.n, config.nr);
    memset(S, 0, config.nr * config.nc * sizeof(float));

    // Warmup
    printf("Running %d warmup iterations...\n", config.warmup);
    for (int i = 0; i < config.warmup; i++) {
        ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc);
    }

    // Benchmark
    printf("Running %d benchmark iterations...\n", config.iterations);
    double total_time = 0.0;
    double min_time = 1e20;
    double max_time = 0.0;
    // Welford running mean and sum of squared deviations, used for the
    // standard deviation of the per-iteration times
    double running_mean = 0.0;
    double sq_dev_sum = 0.0;

    for (int i = 0; i < config.iterations; i++) {
        double start = get_time_ns();
        ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc);
        double end = get_time_ns();

        double elapsed = end - start;
        total_time += elapsed;
        if (elapsed < min_time) min_time = elapsed;
        if (elapsed > max_time) max_time = elapsed;

        // Statistics are updated outside the timed region
        double delta = elapsed - running_mean;
        running_mean += delta / (i + 1);
        sq_dev_sum += delta * (elapsed - running_mean);

        if ((i + 1) % 100 == 0) {
            printf("  Progress: %d/%d iterations\n", i + 1, config.iterations);
        }
    }

    // Calculate statistics
    double avg_time_ns = total_time / config.iterations;
    double avg_time_ms = avg_time_ns / 1e6;
    double min_time_ms = min_time / 1e6;
    double max_time_ms = max_time / 1e6;
    // Sample standard deviation (n - 1); a single iteration has no spread
    double std_time_ms = (config.iterations > 1)
        ? sqrt(sq_dev_sum / (config.iterations - 1)) / 1e6
        : 0.0;

    // Calculate GFLOPS
    // For GEMM: nr x nc x n multiply-adds = 2 * nr * nc * n FLOPs
    // FLOPs per nanosecond is numerically equal to GFLOP/s
    double flops = 2.0 * config.nr * config.nc * config.n;
    double gflops = (flops / avg_time_ns);

    // Calculate throughput (tokens/s assuming each column is a token)
    double throughput = (config.nc * 1e9) / avg_time_ns;

    // Print results
    printf("\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf("Benchmark Results:\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf("  Average time  : %.3f ms\n", avg_time_ms);
    printf("  Min time      : %.3f ms\n", min_time_ms);
    printf("  Max time      : %.3f ms\n", max_time_ms);
    printf("  Std dev       : %.3f ms\n", std_time_ms);
    printf("\nPerformance:\n");
    printf("  GFLOPS        : %.2f\n", gflops);
    printf("  Throughput    : %.2f tokens/s\n", throughput);
    printf("  Latency/token : %.3f us\n", (avg_time_ms * 1000) / config.nc);
    printf("=" "=%.78s\n", "===============================================================================");

    // Cleanup
    free(X);
    free(Y);
    free(S);
}

void print_usage(const char* program) {
    printf("Usage: %s [options]\n", program);
    printf("Options:\n");
    printf("  -n <size>    Embedding dimension (must be divisible by 4, default: 2048)\n");
    printf("  -r <rows>    Number of rows in matrix Y (default: 32)\n");
    printf("  -c <cols>    Number of columns in matrix X (default: 128)\n");
    printf("  -i <iters>   Number of iterations (default: 1000)\n");
    printf("  -w <warmup>  Number of warmup iterations (default: 10)\n");
    printf("  -h           Show this help message\n");
}

int main(int argc, char** argv) {
    BenchmarkConfig config = {
        .n = 2048,
        .nr = 32,
        .nc = 128,
        .iterations = 1000,
        .warmup = 10
    };

    // Parse command line arguments
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-n") == 0 && i + 1 < argc) {
            config.n = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-r") == 0 && i + 1 < argc) {
            config.nr = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
            config.nc = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) {
            config.iterations = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-w") == 0 && i + 1 < argc) {
            config.warmup = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-h") == 0) {
            print_usage(argv[0]);
            return 0;
        } else {
            fprintf(stderr, "Unknown option: %s\n", argv[i]);
            print_usage(argv[0]);
            return 1;
        }
    }

    // Validate configuration
    if (config.n % 4 != 0) {
        fprintf(stderr, "Error: Embedding dimension (-n) must be divisible by 4\n");
        return 1;
    }

    if (config.n <= 0 || config.nr <= 0 || config.nc <= 0 || config.iterations <= 0) {
        fprintf(stderr, "Error: All size parameters must be positive\n");
        return 1;
    }

    // Run benchmark
    print_config(config);
    run_benchmark(config);

    return 0;
}
EOF

    # Compiler flags, kept in an array so include paths containing spaces stay intact
    cxx_flags=(
        -O3 -march=native -mtune=native -std=c++17 -fopenmp
        "-I${SCRIPT_DIR}/.." "-I${SCRIPT_DIR}/../include"
        "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include"
        "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src"
        "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/include"
        -DNDEBUG -ffast-math
    )

    # Link flags
    ld_flags=(-lm -lpthread)

    # Link with pre-built libraries
    GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
    GGML_SO="${GGML_LIB_DIR}/libggml.so"

    if [ ! -f "${GGML_SO}" ]; then
        echo "❌ Error: Cannot find libggml.so at ${GGML_SO}"
        echo "Please build the project first with: cmake --build build"
        rm -f "${TEMP_CPP}"
        exit 1
    fi

    # The rpath entry lets the resulting binary locate libggml.so at run time
    ld_flags+=("-L${GGML_LIB_DIR}" -lggml "-Wl,-rpath,${GGML_LIB_DIR}")

    # Output binary
    BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"

    echo "Compiler: ${CXX}"
    echo "Building from embedded source..."
    echo ""

    # CXX may hold a launcher plus a compiler (e.g. "ccache g++"), so split it
    # on whitespace instead of treating it as a single program name
    read -r -a cxx_cmd <<< "${CXX}"

    # Build
    # The compiler runs as the `if` condition on purpose: errexit is enabled
    # for this script, so a separate `$?` check after the command would never
    # be reached on failure and the temporary source would be left behind.
    if "${cxx_cmd[@]}" "${cxx_flags[@]}" "${TEMP_CPP}" -o "${BENCHMARK_BIN}" "${ld_flags[@]}"; then
        echo "✅ Build successful!"
        rm -f "${TEMP_CPP}"
        echo ""
    else
        echo "❌ Build failed!"
        rm -f "${TEMP_CPP}"
        exit 1
    fi
else
    echo "Step 1: Skipping build (using existing binary)"
    echo "------------------------------------------"
    BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"

    if [ ! -f "${BENCHMARK_BIN}" ]; then
        echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}"
        echo "Please run without -s to build it first."
        exit 1
    fi
    echo "✅ Found existing binary"
    echo ""
fi

# Set LD_LIBRARY_PATH to include the GGML library directory.
# The previous value is appended only when it is non-empty: "dir:" would leave
# an empty trailing entry, which the dynamic loader reads as the current
# working directory.
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
export LD_LIBRARY_PATH="${GGML_LIB_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "Step 2: Running benchmark tests"
echo "------------------------------------------"
echo "Library path: ${GGML_LIB_DIR}"
echo ""

# Write CSV header (this truncates any previous results file)
echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$OUTPUT_CSV"
echo "Results will be saved to: $OUTPUT_CSV"
echo ""

# Function to extract metrics and append to CSV
#
# Arguments:
#   $1 - test name, written to the first CSV column
#   $2 - captured console output of one benchmark run
# Globals:
#   OUTPUT_CSV (read) - file the row is appended to
# Outputs:
#   a warning on stdout when the timing lines cannot be found; in that case the
#   row is still written, with N/A in the time, GFLOPS and throughput columns
extract_and_save() {
    local test_name="$1"
    local output="$2"
    local n nr nc avg_time min_time max_time gflops throughput

    # Every metric is a fixed whitespace-separated field on its labelled line
    n=$(printf '%s\n' "$output" | awk '/Embedding dimension/ {print $5}')
    nr=$(printf '%s\n' "$output" | awk '/Matrix Y rows/ {print $6}')
    nc=$(printf '%s\n' "$output" | awk '/Matrix X columns/ {print $6}')
    avg_time=$(printf '%s\n' "$output" | awk '/Average time/ {print $4}')
    min_time=$(printf '%s\n' "$output" | awk '/Min time/ {print $4}')
    max_time=$(printf '%s\n' "$output" | awk '/Max time/ {print $4}')
    gflops=$(printf '%s\n' "$output" | awk '/GFLOPS/ {print $3}')
    throughput=$(printf '%s\n' "$output" | awk '/Throughput/ {print $3}')

    if [[ -n "$avg_time" && -n "$min_time" && -n "$max_time" ]]; then
        # Spread is estimated from the observed range as (max - min) / 4
        local spread
        spread=$(awk -v min="$min_time" -v max="$max_time" 'BEGIN {printf "%.4f", (max - min) / 4}')

        # The time column is written as mean±spread
        echo "${test_name},${n},${nr},${nc},${avg_time}±${spread},${gflops},${throughput}" >> "$OUTPUT_CSV"
    else
        # All three timing figures are required for a data row
        echo "Warning: Failed to extract timing data for ${test_name}"
        echo "${test_name},${n},${nr},${nc},N/A,N/A,N/A" >> "$OUTPUT_CSV"
    fi
}

# Run benchmark tests
#
# Every case runs the benchmark binary with one matrix shape, echoes its output
# and hands it to extract_and_save. A failing run is recorded and reported at
# the end instead of aborting the remaining cases.
echo "=========================================="
echo "BitNet-2B Typical Shapes Performance Test"
echo "=========================================="
echo ""

# One entry per case: csv_name|title|scenario|n|r|c
# ('|' is the separator because one scenario text contains a comma)
BENCHMARK_CASES=(
    "single_token_gen|Single Token Generation (Attention QKV projection)|Generating 1 token at a time|2048|1|2048"
    "small_batch_prompt|Small Batch Prompt Processing (Attention QKV projection)|Processing prompt with 128 tokens, batch size 1|2048|128|2048"
    "medium_batch_prompt|Medium Batch Prompt Processing (Attention QKV projection)|Processing prompt with 256 tokens or batch of 256|2048|256|2048"
    "large_batch_prompt|Large Batch Processing (Attention QKV projection)|Processing 512 tokens or batch of 512|2048|512|2048"
    "ffn_up_projection|FFN Up-projection (Small batch)|Feed-forward network expansion, 128 tokens|2048|128|8192"
    "ffn_down_projection|FFN Down-projection (Small batch)|Feed-forward network reduction, 128 tokens|8192|128|2048"
    "long_context|Long Context Processing|Processing very long context (2048 tokens)|2048|2048|2048"
    "batched_token_gen|Batched Token Generation|Generating tokens for 32 sequences simultaneously|2048|32|2048"
)

FAILED_TESTS=0
TEST_INDEX=0
for case_spec in "${BENCHMARK_CASES[@]}"; do
    IFS='|' read -r case_name case_title case_scenario dim_n dim_r dim_c <<< "$case_spec"
    TEST_INDEX=$((TEST_INDEX + 1))

    echo "Test ${TEST_INDEX}: ${case_title}"
    echo "  Scenario: ${case_scenario}"
    echo "  Shape: n=${dim_n}, r=${dim_r}, c=${dim_c}"

    # Capture the exit status explicitly: with errexit enabled, a bare
    # OUTPUT=$(...) would end the whole script on the first failing run,
    # before its output is shown or a row is written to the CSV.
    run_status=0
    OUTPUT=$("$BENCHMARK_BIN" -n "$dim_n" -r "$dim_r" -c "$dim_c" -i "$ITERATIONS" 2>&1) || run_status=$?
    echo "$OUTPUT"
    if [ "$run_status" -ne 0 ]; then
        echo "Warning: benchmark exited with status ${run_status} for ${case_name}" >&2
        FAILED_TESTS=$((FAILED_TESTS + 1))
    fi

    extract_and_save "$case_name" "$OUTPUT"
    echo ""
done

echo "=========================================="
if [ "$FAILED_TESTS" -eq 0 ]; then
    echo "All tests completed successfully!"
else
    echo "Completed with ${FAILED_TESTS} failed test(s) out of ${TEST_INDEX}"
fi
echo "=========================================="
echo "Results saved to: $OUTPUT_CSV"
echo ""
echo "Summary:"
# Subtract one for the CSV header line
wc -l "$OUTPUT_CSV" | awk '{print "  Total records:", $1 - 1}'
echo "  Output file: $OUTPUT_CSV"
echo "=========================================="

# Propagate benchmark failures to the caller
if [ "$FAILED_TESTS" -ne 0 ]; then
    exit 1
fi