#!/bin/bash
# Unified GEMM kernel benchmark script
# Builds, tests, and benchmarks the GEMM kernel with configurable output
#
# The script writes an embedded C++ benchmark to the build directory, compiles
# it against the pre-built libggml.so, runs it for a fixed list of matrix
# shapes and appends one CSV row per shape to the output file.
#
# Environment:
#   CXX  Compiler command (default: g++). Split on whitespace, so values such
#        as "ccache g++" work.
#
# Exit status: 0 when every benchmark run succeeded, 1 otherwise.

set -euo pipefail

# Default values
BUILD_DIR="../build"
ITERATIONS=1000
OUTPUT_CSV=""
SKIP_BUILD=false
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Derived locations (BUILD_DIR is relative to SCRIPT_DIR)
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"

# Path of the generated C++ source while it exists on disk; empty otherwise.
TEMP_CPP=""

# Number of benchmark invocations that exited with a non-zero status.
FAILED_TESTS=0

# Print usage
print_usage() {
  cat << EOF
Usage: $0 [options]

Options:
  -o, --output <file>     Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv)
  -i, --iterations <n>    Number of iterations per test (default: 1000)
  -s, --skip-build        Skip building the benchmark binary
  -h, --help              Show this help message

Examples:
  # Run with default settings
  $0

  # Specify custom output file
  $0 -o /path/to/my_results.csv

  # Quick test with fewer iterations
  $0 -i 100 -o quick_test.csv

  # Skip build if already compiled
  $0 -s -o results.csv
EOF
}

# Print an error message followed by the usage text on stderr, then exit 1.
usage_error() {
  printf '%s\n' "$*" >&2
  print_usage >&2
  exit 1
}

# Delete the generated C++ source if it is still on disk.
# Installed as the EXIT trap so the file is removed on every exit path.
remove_temp_source() {
  if [[ -n "${TEMP_CPP}" ]]; then
    rm -f -- "${TEMP_CPP}"
    TEMP_CPP=""
  fi
}

# Parse command line arguments into ITERATIONS, OUTPUT_CSV and SKIP_BUILD.
# Arguments: the script's positional parameters.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)
        # Without this check "shift 2" fails under set -e with no message.
        [[ $# -ge 2 ]] || usage_error "Option $1 requires an argument"
        OUTPUT_CSV="$2"
        shift 2
        ;;
      -i|--iterations)
        [[ $# -ge 2 ]] || usage_error "Option $1 requires an argument"
        ITERATIONS="$2"
        shift 2
        ;;
      -s|--skip-build)
        SKIP_BUILD=true
        shift
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        usage_error "Unknown option: $1"
        ;;
    esac
  done

  # The benchmark binary rejects non-positive iteration counts; fail early
  # instead of after the build.
  if [[ ! "${ITERATIONS}" =~ ^[1-9][0-9]*$ ]]; then
    usage_error "Iterations must be a positive integer, got: ${ITERATIONS}"
  fi
}

# Apply the default output path, create its directory and make OUTPUT_CSV
# absolute.
resolve_output_csv() {
  # Set default output CSV if not specified
  if [[ -z "${OUTPUT_CSV}" ]]; then
    OUTPUT_CSV="${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv"
  fi

  # Create output directory first (the cd below needs it to exist)
  local out_dir
  out_dir="$(dirname -- "${OUTPUT_CSV}")"
  mkdir -p -- "${out_dir}"

  # Convert relative path to absolute
  if [[ "${OUTPUT_CSV}" != /* ]]; then
    OUTPUT_CSV="$(cd -- "${out_dir}" && pwd)/$(basename -- "${OUTPUT_CSV}")"
  fi
}

# Write the embedded C++ benchmark source to the file named by TEMP_CPP.
# The heredoc delimiter is quoted, so nothing inside is expanded by the shell.
write_benchmark_source() {
  cat > "${TEMP_CPP}" << 'EOF'
/**
 * Standalone benchmark for ggml_gemm_i2_i8_s kernel
 *
 * This program tests the performance of the ggml_gemm_i2_i8_s kernel
 * with configurable matrix sizes and iteration counts.
 *
 * Usage: ./test_gemm_kernel [options]
 *   -n <n> : embedding dimension (must be divisible by 4, default: 2048)
 *   -r <n> : number of rows in matrix Y (default: 32)
 *   -c <n> : number of columns in matrix X (default: 128)
 *   -i <n> : number of iterations (default: 1000)
 *   -w <n> : number of warmup iterations (default: 10)
 */

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Include necessary headers
#include "../include/gemm-config.h"

// Function declarations (from ggml-quants.h)
extern "C" void ggml_vec_dot_i2_i8_s(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc);

// GEMM kernel definition
// Walks the nr x nc output in ROW_BLOCK_SIZE x COL_BLOCK_SIZE tiles and calls
// ggml_vec_dot_i2_i8_s once per row (or per column when ACT_PARALLEL is
// defined) of each tile.
void ggml_gemm_i2_i8_s(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
#if defined(ACT_PARALLEL)
    const int64_t row_block = ROW_BLOCK_SIZE;
    const int64_t col_block = COL_BLOCK_SIZE;

    for (int64_t c0 = 0; c0 < nc; c0 += col_block) {
        int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0);
        for (int64_t r0 = 0; r0 < nr; r0 += row_block) {
            int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0);
            const void * vy_r = (const uint8_t *)vy + r0 * n;
            for (int64_t c = 0; c < cur_c; ++c) {
                const int64_t col = c0 + c;
                float * s_col = s + col;
                const void * vx_col = (const uint8_t *)vx + col * n / 4;
                ggml_vec_dot_i2_i8_s(n, s_col + r0 * bs, bs, vx_col, n, vy_r, n, cur_r);
            }
        }
    }
#else
    const int64_t row_block = ROW_BLOCK_SIZE;
    const int64_t col_block = COL_BLOCK_SIZE;

    for (int64_t r0 = 0; r0 < nr; r0 += row_block) {
        int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0);
        for (int64_t c0 = 0; c0 < nc; c0 += col_block) {
            int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0);
            const void * vx_c = (const uint8_t *)vx + c0 * n / 4;
            for (int64_t r = 0; r < cur_r; ++r) {
                const int64_t row = r0 + r;
                float * s_row = s + row * bs;
                const void * vy_row = (const uint8_t *)vy + row * n;
                ggml_vec_dot_i2_i8_s(n, s_row + c0, bs, vx_c, n, vy_row, n, cur_c);
            }
        }
    }
#endif
}

// Helper function to get current time in nanoseconds
double get_time_ns() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1e9 + ts.tv_nsec;
}

// Initialize matrix with random i2 values (2-bit quantized)
void init_matrix_i2(uint8_t* data, int n, int cols) {
    // i2 format: 4 values per byte (2 bits each)
    // Widen before multiplying so large shapes do not overflow int.
    size_t total_bytes = (size_t)n * cols / 4;
    for (size_t i = 0; i < total_bytes; i++) {
        data[i] = rand() & 0xFF;
    }
}

// Initialize matrix with random i8 values
void init_matrix_i8(int8_t* data, int n, int rows) {
    size_t total_elements = (size_t)n * rows;
    for (size_t i = 0; i < total_elements; i++) {
        data[i] = (int8_t)((rand() % 256) - 128);
    }
}

// Benchmark configuration
struct BenchmarkConfig {
    int n;           // embedding dimension (must be divisible by 4)
    int nr;          // number of rows in Y matrix
    int nc;          // number of columns in X matrix
    int iterations;  // number of benchmark iterations
    int warmup;      // number of warmup iterations
};

void print_config(const BenchmarkConfig& config) {
    printf("=" "=%.78s\n", "===============================================================================");
    printf("Benchmark Configuration:\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf(" Embedding dimension (n) : %d\n", config.n);
    printf(" Matrix Y rows (nr) : %d\n", config.nr);
    printf(" Matrix X columns (nc) : %d\n", config.nc);
    printf(" Iterations : %d\n", config.iterations);
    printf(" Warmup iterations : %d\n", config.warmup);
    printf("\nMatrix sizes:\n");
    // Sizes are computed in double so the products cannot overflow int.
    printf(" X (i2): %d x %d (%.2f KB)\n", config.nc, config.n,
           ((double)config.nc * config.n / 4) / 1024.0);
    printf(" Y (i8): %d x %d (%.2f KB)\n", config.nr, config.n,
           ((double)config.nr * config.n) / 1024.0);
    printf(" S (f32): %d x %d (%.2f KB)\n", config.nr, config.nc,
           ((double)config.nr * config.nc * sizeof(float)) / 1024.0);
    printf("\nGEMM Config:\n");
#if defined(ACT_PARALLEL)
    printf(" ACT_PARALLEL : ON\n");
#else
    printf(" ACT_PARALLEL : OFF\n");
#endif
    printf(" ROW_BLOCK_SIZE : %d\n", ROW_BLOCK_SIZE);
    printf(" COL_BLOCK_SIZE : %d\n", COL_BLOCK_SIZE);
    printf(" PARALLEL_SIZE : %d\n", PARALLEL_SIZE);
    printf("=" "=%.78s\n\n", "===============================================================================");
}

void run_benchmark(const BenchmarkConfig& config) {
    // Allocate matrices
    printf("Allocating matrices...\n");

    // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes
    // Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes)
    // aligned_alloc needs a size that is a multiple of the alignment, hence
    // the round-up of every buffer size.
    size_t x_size = (size_t)config.nc * config.n / 4;
    size_t x_size_aligned = ((x_size + 63) / 64) * 64;
    uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned);

    // Y matrix (i8 format): nr x n
    size_t y_size = (size_t)config.nr * config.n;
    size_t y_size_aligned = ((y_size + 63) / 64) * 64;
    int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned);

    // Result matrix (float32): nr x nc
    size_t s_size = (size_t)config.nr * config.nc * sizeof(float);
    size_t s_size_aligned = ((s_size + 63) / 64) * 64;
    float* S = (float*)aligned_alloc(64, s_size_aligned);

    if (!X || !Y || !S) {
        fprintf(stderr, "Failed to allocate memory\n");
        exit(1);
    }

    // Initialize matrices with random data
    printf("Initializing matrices with random data...\n");
    srand(time(NULL));
    init_matrix_i2(X, config.n, config.nc);
    init_matrix_i8(Y, config.n, config.nr);
    memset(S, 0, s_size);

    // Warmup
    printf("Running %d warmup iterations...\n", config.warmup);
    for (int i = 0; i < config.warmup; i++) {
        ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc);
    }

    // Benchmark
    printf("Running %d benchmark iterations...\n", config.iterations);
    double total_time = 0.0;
    double min_time = 1e20;
    double max_time = 0.0;

    for (int i = 0; i < config.iterations; i++) {
        double start = get_time_ns();
        ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc);
        double end = get_time_ns();

        double elapsed = end - start;
        total_time += elapsed;
        if (elapsed < min_time) min_time = elapsed;
        if (elapsed > max_time) max_time = elapsed;

        if ((i + 1) % 100 == 0) {
            printf(" Progress: %d/%d iterations\n", i + 1, config.iterations);
        }
    }

    // Calculate statistics
    double avg_time_ns = total_time / config.iterations;
    double avg_time_ms = avg_time_ns / 1e6;
    double min_time_ms = min_time / 1e6;
    double max_time_ms = max_time / 1e6;

    // Calculate GFLOPS
    // For GEMM: nr x nc x n multiply-adds = 2 * nr * nc * n FLOPs
    // FLOPs per nanosecond is numerically equal to GFLOP per second.
    double flops = 2.0 * config.nr * config.nc * config.n;
    double gflops = (flops / avg_time_ns);

    // Calculate throughput (tokens/s assuming each column is a token)
    double throughput = (config.nc * 1e9) / avg_time_ns;

    // Print results
    printf("\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf("Benchmark Results:\n");
    printf("=" "=%.78s\n", "===============================================================================");
    printf(" Average time : %.3f ms\n", avg_time_ms);
    printf(" Min time : %.3f ms\n", min_time_ms);
    printf(" Max time : %.3f ms\n", max_time_ms);
    // Not a measured standard deviation: an estimate derived from the
    // min/max range only, (max - min) / sqrt(12).
    printf(" Std dev : %.3f ms\n", sqrt((max_time_ms - min_time_ms) * (max_time_ms - min_time_ms) / 12));
    printf("\nPerformance:\n");
    printf(" GFLOPS : %.2f\n", gflops);
    printf(" Throughput : %.2f tokens/s\n", throughput);
    printf(" Latency/token : %.3f us\n", (avg_time_ms * 1000) / config.nc);
    printf("=" "=%.78s\n", "===============================================================================");

    // Cleanup
    free(X);
    free(Y);
    free(S);
}

void print_usage(const char* program) {
    printf("Usage: %s [options]\n", program);
    printf("Options:\n");
    printf(" -n <n> Embedding dimension (must be divisible by 4, default: 2048)\n");
    printf(" -r <n> Number of rows in matrix Y (default: 32)\n");
    printf(" -c <n> Number of columns in matrix X (default: 128)\n");
    printf(" -i <n> Number of iterations (default: 1000)\n");
    printf(" -w <n> Number of warmup iterations (default: 10)\n");
    printf(" -h Show this help message\n");
}

int main(int argc, char** argv) {
    // Plain member assignment: designated initializers are a C++20 feature
    // and this file is compiled with -std=c++17.
    BenchmarkConfig config;
    config.n = 2048;
    config.nr = 32;
    config.nc = 128;
    config.iterations = 1000;
    config.warmup = 10;

    // Parse command line arguments
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-n") == 0 && i + 1 < argc) {
            config.n = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-r") == 0 && i + 1 < argc) {
            config.nr = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
            config.nc = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) {
            config.iterations = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-w") == 0 && i + 1 < argc) {
            config.warmup = atoi(argv[++i]);
        } else if (strcmp(argv[i], "-h") == 0) {
            print_usage(argv[0]);
            return 0;
        } else {
            fprintf(stderr, "Unknown option: %s\n", argv[i]);
            print_usage(argv[0]);
            return 1;
        }
    }

    // Validate configuration
    if (config.n % 4 != 0) {
        fprintf(stderr, "Error: Embedding dimension (-n) must be divisible by 4\n");
        return 1;
    }

    if (config.n <= 0 || config.nr <= 0 || config.nc <= 0 || config.iterations <= 0) {
        fprintf(stderr, "Error: All size parameters must be positive\n");
        return 1;
    }

    // Run benchmark
    print_config(config);
    run_benchmark(config);

    return 0;
}
EOF
}

# Compile the embedded benchmark into BENCHMARK_BIN, linking against the
# pre-built libggml.so. Exits 1 when the library is missing or the compile
# fails.
build_benchmark() {
  echo "Step 1: Building GEMM kernel benchmark..."
  echo "------------------------------------------"

  # Split on whitespace on purpose so CXX="ccache g++" keeps working.
  local -a cxx_cmd
  read -r -a cxx_cmd <<< "${CXX:-g++}"

  # Create build directory if it doesn't exist
  mkdir -p -- "${SCRIPT_DIR}/${BUILD_DIR}"

  # Link with pre-built libraries; check before generating any file.
  local ggml_so="${GGML_LIB_DIR}/libggml.so"
  if [[ ! -f "${ggml_so}" ]]; then
    echo "❌ Error: Cannot find libggml.so at ${ggml_so}" >&2
    echo "Please build the project first with: cmake --build build" >&2
    exit 1
  fi

  # Create temporary C++ source file. It lives in the build directory because
  # the source includes "../include/gemm-config.h" relative to its own path.
  TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp"
  write_benchmark_source

  # Compiler flags (arrays keep paths containing spaces intact)
  local -a cxxflags=(
    -O3 -march=native -mtune=native -std=c++17 -fopenmp
    "-I${SCRIPT_DIR}/.."
    "-I${SCRIPT_DIR}/../include"
    "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include"
    "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src"
    "-I${SCRIPT_DIR}/../3rdparty/llama.cpp/include"
    -DNDEBUG -ffast-math
  )

  # Link flags
  local -a ldflags=(
    -lm -lpthread
    "-L${GGML_LIB_DIR}" -lggml "-Wl,-rpath,${GGML_LIB_DIR}"
  )

  echo "Compiler: ${cxx_cmd[*]}"
  echo "Building from embedded source..."
  echo ""

  # Build. The compiler runs as the "if" condition: under set -e a separate
  # "$?" check afterwards would never see a failure.
  if "${cxx_cmd[@]}" "${cxxflags[@]}" "${TEMP_CPP}" -o "${BENCHMARK_BIN}" "${ldflags[@]}"; then
    echo "✅ Build successful!"
    remove_temp_source
    echo ""
  else
    echo "❌ Build failed!" >&2
    remove_temp_source
    exit 1
  fi
}

# Print field number $3 of the first line of $1 that contains the fixed
# string $2. Prints nothing when no line matches.
extract_field() {
  awk -v pat="$2" -v col="$3" 'index($0, pat) { print $col; exit }' <<< "$1"
}

# Function to extract metrics and append to CSV
# Arguments: $1 - test name (first CSV column)
#            $2 - captured output of one benchmark run
# Appends exactly one row to OUTPUT_CSV; the timing columns are N/A when the
# output contains no timing lines.
extract_and_save() {
  local test_name="$1"
  local output="$2"
  local n nr nc avg_time min_time max_time gflops throughput std_time

  # Field numbers follow the "label ... : value" layout printed by the binary.
  n="$(extract_field "${output}" "Embedding dimension" 5)"
  nr="$(extract_field "${output}" "Matrix Y rows" 6)"
  nc="$(extract_field "${output}" "Matrix X columns" 6)"
  avg_time="$(extract_field "${output}" "Average time" 4)"
  min_time="$(extract_field "${output}" "Min time" 4)"
  max_time="$(extract_field "${output}" "Max time" 4)"
  gflops="$(extract_field "${output}" "GFLOPS" 3)"
  throughput="$(extract_field "${output}" "Throughput" 3)"

  # Check if values were extracted successfully
  if [[ -z "${avg_time}" || -z "${min_time}" || -z "${max_time}" ]]; then
    echo "Warning: Failed to extract timing data for ${test_name}" >&2
    echo "${test_name},${n},${nr},${nc},N/A,N/A,N/A" >> "${OUTPUT_CSV}"
    return 0
  fi

  # Calculate standard deviation estimate from range: (max - min) / 4.
  # This is a rough estimate, not a measured standard deviation.
  std_time="$(awk -v min="${min_time}" -v max="${max_time}" \
    'BEGIN {printf "%.4f", (max - min) / 4}')"

  # Format as mean±std and append to CSV
  echo "${test_name},${n},${nr},${nc},${avg_time}±${std_time},${gflops},${throughput}" >> "${OUTPUT_CSV}"
}

# Run the benchmark binary for one shape and record the result.
# Arguments: $1 - CSV test name
#            $2 - title line
#            $3 - scenario description
#            $4 - n, $5 - r, $6 - c (passed to the binary as -n/-r/-c)
# A non-zero exit of the binary is counted in FAILED_TESTS and does not stop
# the remaining tests; its output is still printed and a row still recorded.
run_test() {
  local test_name="$1" title="$2" scenario="$3"
  local n="$4" r="$5" c="$6"
  local output status=0

  echo "${title}"
  echo " Scenario: ${scenario}"
  echo " Shape: n=${n}, r=${r}, c=${c}"

  output="$("${BENCHMARK_BIN}" -n "${n}" -r "${r}" -c "${c}" -i "${ITERATIONS}" 2>&1)" || status=$?
  echo "${output}"
  if (( status != 0 )); then
    echo "Warning: benchmark exited with status ${status} for ${test_name}" >&2
    FAILED_TESTS=$((FAILED_TESTS + 1))
  fi

  extract_and_save "${test_name}" "${output}"
  echo ""
}

main() {
  parse_args "$@"
  resolve_output_csv

  echo "=========================================="
  echo "GEMM Kernel Benchmark Suite"
  echo "=========================================="
  echo "Configuration:"
  echo " Iterations: $ITERATIONS"
  echo " Output CSV: $OUTPUT_CSV"
  echo " Skip build: $SKIP_BUILD"
  echo "=========================================="
  echo ""

  # Build the benchmark binary
  if [[ "${SKIP_BUILD}" == false ]]; then
    build_benchmark
  else
    echo "Step 1: Skipping build (using existing binary)"
    echo "------------------------------------------"

    if [[ ! -f "${BENCHMARK_BIN}" ]]; then
      echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}" >&2
      echo "Please run without -s to build it first." >&2
      exit 1
    fi
    echo "✅ Found existing binary"
    echo ""
  fi

  # Set LD_LIBRARY_PATH to include the GGML library directory. The ":+" form
  # avoids a trailing ":" when the variable was unset; an empty entry would
  # add the current directory to the library search path.
  export LD_LIBRARY_PATH="${GGML_LIB_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  echo "Step 2: Running benchmark tests"
  echo "------------------------------------------"
  echo "Library path: ${GGML_LIB_DIR}"
  echo ""

  # Write CSV header (truncates any previous results)
  echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "${OUTPUT_CSV}"
  echo "Results will be saved to: $OUTPUT_CSV"
  echo ""

  # Run benchmark tests
  echo "=========================================="
  echo "BitNet-2B Typical Shapes Performance Test"
  echo "=========================================="
  echo ""

  run_test "single_token_gen" \
    "Test 1: Single Token Generation (Attention QKV projection)" \
    "Generating 1 token at a time" \
    2048 1 2048

  run_test "small_batch_prompt" \
    "Test 2: Small Batch Prompt Processing (Attention QKV projection)" \
    "Processing prompt with 128 tokens, batch size 1" \
    2048 128 2048

  run_test "medium_batch_prompt" \
    "Test 3: Medium Batch Prompt Processing (Attention QKV projection)" \
    "Processing prompt with 256 tokens or batch of 256" \
    2048 256 2048

  run_test "large_batch_prompt" \
    "Test 4: Large Batch Processing (Attention QKV projection)" \
    "Processing 512 tokens or batch of 512" \
    2048 512 2048

  run_test "ffn_up_projection" \
    "Test 5: FFN Up-projection (Small batch)" \
    "Feed-forward network expansion, 128 tokens" \
    2048 128 8192

  run_test "ffn_down_projection" \
    "Test 6: FFN Down-projection (Small batch)" \
    "Feed-forward network reduction, 128 tokens" \
    8192 128 2048

  run_test "long_context" \
    "Test 7: Long Context Processing" \
    "Processing very long context (2048 tokens)" \
    2048 2048 2048

  run_test "batched_token_gen" \
    "Test 8: Batched Token Generation" \
    "Generating tokens for 32 sequences simultaneously" \
    2048 32 2048

  echo "=========================================="
  if (( FAILED_TESTS == 0 )); then
    echo "All tests completed successfully!"
  else
    echo "Completed with ${FAILED_TESTS} failed test(s)" >&2
  fi
  echo "=========================================="
  echo "Results saved to: $OUTPUT_CSV"
  echo ""
  echo "Summary:"
  # Every row ends with a newline, so the record count is lines minus header.
  awk 'END {print " Total records:", NR - 1}' "${OUTPUT_CSV}"
  echo " Output file: $OUTPUT_CSV"
  echo "=========================================="

  if (( FAILED_TESTS > 0 )); then
    exit 1
  fi
}

trap remove_temp_source EXIT
main "$@"