diff --git a/utils/build_test_gemm_kernel.sh b/utils/build_test_gemm_kernel.sh deleted file mode 100755 index bc45942..0000000 --- a/utils/build_test_gemm_kernel.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Build script for standalone GEMM kernel benchmark - -set -e - -echo "Building GEMM kernel benchmark..." - -# Compiler settings -CXX=${CXX:-g++} -BUILD_DIR="../build" -SRC_DIR="../src" - -# Create build directory if it doesn't exist -mkdir -p ${BUILD_DIR} - -# Compiler flags -CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp" -CXXFLAGS+=" -I.. -I../include" -CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/include" -CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/src" -CXXFLAGS+=" -I../3rdparty/llama.cpp/include" -CXXFLAGS+=" -DNDEBUG -ffast-math" - -# Link flags -LDFLAGS="-lm -lpthread" - -# Link with pre-built libraries -GGML_LIB_DIR="../build/3rdparty/llama.cpp/ggml/src" -GGML_SO="${GGML_LIB_DIR}/libggml.so" - -if [ ! -f "${GGML_SO}" ]; then - echo "⚠️ Warning: Cannot find libggml.so" - echo "Please build the project first with: cmake --build build" - exit 1 -fi - -LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,\$ORIGIN/../../${GGML_LIB_DIR}" -echo "Linking with libggml.so" - -# Source files -SOURCES="./test_gemm_kernel.cpp" - -# Output binary -OUTPUT="${BUILD_DIR}/test_gemm_kernel" - -echo "Compiler: ${CXX}" -echo "Flags: ${CXXFLAGS}" -echo "Sources: ${SOURCES}" -echo "" - -# Build -${CXX} ${CXXFLAGS} ${SOURCES} -o ${OUTPUT} ${LDFLAGS} - -if [ $? -eq 0 ]; then - echo "" - echo "✅ Build successful!" - echo "Output: ${OUTPUT}" - echo "" - echo "Usage examples:" - echo " # Default test (n=2048, nr=32, nc=128, 1000 iterations)" - echo " ${OUTPUT}" - echo "" - echo " # Custom matrix sizes" - echo " ${OUTPUT} -n 4096 -r 64 -c 256" - echo "" - echo " # Quick test (fewer iterations)" - echo " ${OUTPUT} -i 100 -w 5" - echo "" - echo " # Large-scale test" - echo " ${OUTPUT} -n 3200 -r 128 -c 512 -i 500" - echo "" -else - echo "" - echo "❌ Build failed!" - exit 1 -fi diff --git a/utils/kernel_tuning.py b/utils/kernel_tuning.py deleted file mode 100644 index e69de29..0000000 diff --git a/utils/test_gemm_kernel.cpp b/utils/test_gemm_kernel.sh old mode 100644 new mode 100755 similarity index 51% rename from utils/test_gemm_kernel.cpp rename to utils/test_gemm_kernel.sh index 36964ce..ae72c72 --- a/utils/test_gemm_kernel.cpp +++ b/utils/test_gemm_kernel.sh @@ -1,3 +1,110 @@ +#!/bin/bash +# Unified GEMM kernel benchmark script +# Builds, tests, and benchmarks the GEMM kernel with configurable output + +set -e + +# Default values +BUILD_DIR="../build" +ITERATIONS=1000 +OUTPUT_CSV="" +SKIP_BUILD=false +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Print usage +print_usage() { + cat << EOF +Usage: $0 [options] + +Options: + -o, --output Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv) + -i, --iterations Number of iterations per test (default: 1000) + -s, --skip-build Skip building the benchmark binary + -h, --help Show this help message + +Examples: + # Run with default settings + $0 + + # Specify custom output file + $0 -o /path/to/my_results.csv + + # Quick test with fewer iterations + $0 -i 100 -o quick_test.csv + + # Skip build if already compiled + $0 -s -o results.csv +EOF +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -o|--output) + OUTPUT_CSV="$2" + shift 2 + ;; + -i|--iterations) + ITERATIONS="$2" + shift 2 + ;; + -s|--skip-build) + SKIP_BUILD=true + shift + ;; + -h|--help) + print_usage + exit 0 + ;; + *) + echo "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Set default output CSV if not specified +if [ -z "$OUTPUT_CSV" ]; then + OUTPUT_CSV="${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv" +fi + +# Create output directory first +mkdir -p "$(dirname "$OUTPUT_CSV")" + +# Convert to absolute path +if [[ "$OUTPUT_CSV" = /* ]]; then + # Already absolute path + OUTPUT_CSV="$OUTPUT_CSV" +else + # Convert relative path to absolute + OUTPUT_CSV="$(cd "$(dirname "$OUTPUT_CSV")" && pwd)/$(basename "$OUTPUT_CSV")" +fi + +echo "==========================================" +echo "GEMM Kernel Benchmark Suite" +echo "==========================================" +echo "Configuration:" +echo " Iterations: $ITERATIONS" +echo " Output CSV: $OUTPUT_CSV" +echo " Skip build: $SKIP_BUILD" +echo "==========================================" +echo "" + +# Build the benchmark binary +if [ "$SKIP_BUILD" = false ]; then + echo "Step 1: Building GEMM kernel benchmark..." + echo "------------------------------------------" + + CXX=${CXX:-g++} + + # Create build directory if it doesn't exist + mkdir -p "${SCRIPT_DIR}/${BUILD_DIR}" + + # Create temporary C++ source file + TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp" + + cat > "${TEMP_CPP}" << 'EOF' /** * Standalone benchmark for ggml_gemm_i2_i8_s kernel * @@ -131,13 +238,20 @@ void run_benchmark(const BenchmarkConfig& config) { printf("Allocating matrices...\n"); // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes - uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4); + // Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes) + size_t x_size = config.nc * config.n / 4; + size_t x_size_aligned = ((x_size + 63) / 64) * 64; + uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned); // Y matrix (i8 format): nr x n - int8_t* Y = (int8_t*)malloc(config.nr * config.n); + size_t y_size = config.nr * config.n; + size_t y_size_aligned = ((y_size + 63) / 64) * 64; + int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned); // Result matrix (float32): nr x nc - float* S = (float*)malloc(config.nr * config.nc * sizeof(float)); + size_t s_size = config.nr * config.nc * sizeof(float); + size_t s_size_aligned = ((s_size + 63) / 64) * 64; + float* S = (float*)aligned_alloc(64, s_size_aligned); if (!X || !Y || !S) { fprintf(stderr, "Failed to allocate memory\n"); @@ -272,3 +386,188 @@ int main(int argc, char** argv) { return 0; } +EOF + + # Compiler flags + CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp" + CXXFLAGS+=" -I${SCRIPT_DIR}/.. -I${SCRIPT_DIR}/../include" + CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include" + CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src" + CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/include" + CXXFLAGS+=" -DNDEBUG -ffast-math" + + # Link flags + LDFLAGS="-lm -lpthread" + + # Link with pre-built libraries + GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src" + GGML_SO="${GGML_LIB_DIR}/libggml.so" + + if [ ! -f "${GGML_SO}" ]; then + echo "❌ Error: Cannot find libggml.so at ${GGML_SO}" + echo "Please build the project first with: cmake --build build" + rm -f "${TEMP_CPP}" + exit 1 + fi + + LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,${GGML_LIB_DIR}" + + # Output binary + BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel" + + echo "Compiler: ${CXX}" + echo "Building from embedded source..." + echo "" + + # Build + ${CXX} ${CXXFLAGS} "${TEMP_CPP}" -o ${BENCHMARK_BIN} ${LDFLAGS} + + if [ $? -eq 0 ]; then + echo "✅ Build successful!" + rm -f "${TEMP_CPP}" + echo "" + else + echo "❌ Build failed!" + rm -f "${TEMP_CPP}" + exit 1 + fi +else + echo "Step 1: Skipping build (using existing binary)" + echo "------------------------------------------" + BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel" + + if [ ! -f "${BENCHMARK_BIN}" ]; then + echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}" + echo "Please run without -s to build it first." + exit 1 + fi + echo "✅ Found existing binary" + echo "" +fi + +# Set LD_LIBRARY_PATH to include the GGML library directory +GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src" +export LD_LIBRARY_PATH="${GGML_LIB_DIR}:${LD_LIBRARY_PATH}" + +echo "Step 2: Running benchmark tests" +echo "------------------------------------------" +echo "Library path: ${GGML_LIB_DIR}" +echo "" + +# Write CSV header +echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$OUTPUT_CSV" +echo "Results will be saved to: $OUTPUT_CSV" +echo "" + +# Function to extract metrics and append to CSV +extract_and_save() { + local test_name="$1" + local output="$2" + + # Extract values using grep and awk + local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}') + local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}') + local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}') + local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}') + local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}') + local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}') + local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}') + local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}') + + # Check if values were extracted successfully + if [ -z "$avg_time" ] || [ -z "$min_time" ] || [ -z "$max_time" ]; then + echo "Warning: Failed to extract timing data for ${test_name}" + echo "${test_name},${n},${nr},${nc},N/A,N/A,N/A" >> "$OUTPUT_CSV" + return + fi + + # Calculate standard deviation estimate from range + # Using awk with proper variable passing + local std_time=$(awk -v min="$min_time" -v max="$max_time" 'BEGIN {printf "%.4f", (max - min) / 4}') + + # Format as mean±std + local time_formatted="${avg_time}±${std_time}" + + # Append to CSV + echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$OUTPUT_CSV" +} + +# Run benchmark tests +echo "==========================================" +echo "BitNet-2B Typical Shapes Performance Test" +echo "==========================================" +echo "" + +echo "Test 1: Single Token Generation (Attention QKV projection)" +echo " Scenario: Generating 1 token at a time" +echo " Shape: n=2048, r=1, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "single_token_gen" "$OUTPUT" +echo "" + +echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)" +echo " Scenario: Processing prompt with 128 tokens, batch size 1" +echo " Shape: n=2048, r=128, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "small_batch_prompt" "$OUTPUT" +echo "" + +echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)" +echo " Scenario: Processing prompt with 256 tokens or batch of 256" +echo " Shape: n=2048, r=256, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "medium_batch_prompt" "$OUTPUT" +echo "" + +echo "Test 4: Large Batch Processing (Attention QKV projection)" +echo " Scenario: Processing 512 tokens or batch of 512" +echo " Shape: n=2048, r=512, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "large_batch_prompt" "$OUTPUT" +echo "" + +echo "Test 5: FFN Up-projection (Small batch)" +echo " Scenario: Feed-forward network expansion, 128 tokens" +echo " Shape: n=2048, r=128, c=8192" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "ffn_up_projection" "$OUTPUT" +echo "" + +echo "Test 6: FFN Down-projection (Small batch)" +echo " Scenario: Feed-forward network reduction, 128 tokens" +echo " Shape: n=8192, r=128, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "ffn_down_projection" "$OUTPUT" +echo "" + +echo "Test 7: Long Context Processing" +echo " Scenario: Processing very long context (2048 tokens)" +echo " Shape: n=2048, r=2048, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "long_context" "$OUTPUT" +echo "" + +echo "Test 8: Batched Token Generation" +echo " Scenario: Generating tokens for 32 sequences simultaneously" +echo " Shape: n=2048, r=32, c=2048" +OUTPUT=$($BENCHMARK_BIN -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1) +echo "$OUTPUT" +extract_and_save "batched_token_gen" "$OUTPUT" +echo "" + +echo "==========================================" +echo "All tests completed successfully!" +echo "==========================================" +echo "Results saved to: $OUTPUT_CSV" +echo "" +echo "Summary:" +wc -l "$OUTPUT_CSV" | awk '{print " Total records:", $1 - 1}' +echo " Output file: $OUTPUT_CSV" +echo "==========================================" diff --git a/utils/test_parallel_strategy.sh b/utils/test_parallel_strategy.sh deleted file mode 100755 index 44da140..0000000 --- a/utils/test_parallel_strategy.sh +++ /dev/null @@ -1,277 +0,0 @@ -#!/bin/bash - -# Script: Test different GEMM parallel strategy performance -# Strategies: weight-parallel and no-parallel -# Thread counts: 1,2,4,8,12,16 - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -GEMM_CONFIG="$PROJECT_ROOT/include/gemm-config.h" -GEMM_CONFIG_BACKUP="$PROJECT_ROOT/include/gemm-config.h.bak" -BUILD_DIR="$PROJECT_ROOT/build" -STATS_DIR="$PROJECT_ROOT/stats" -CSV_FILE="$STATS_DIR/test_parallel_strategy_benchmark.csv" -MODEL_PATH="$PROJECT_ROOT/models/BitNet-b1.58-2B-4T/ggml-model-original.gguf" -BENCHMARK_CMD="./build/bin/llama-bench" -THREADS_LIST="1 2 4 8 12 16" - -# Color output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -log_info() { - echo -e "${GREEN}[INFO]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Check prerequisites -check_prerequisites() { - log_info "Checking prerequisites..." - - if [ ! -f "$GEMM_CONFIG" ]; then - log_error "gemm-config.h not found: $GEMM_CONFIG" - exit 1 - fi - - if [ ! -f "$MODEL_PATH" ]; then - log_error "Model file not found: $MODEL_PATH" - exit 1 - fi - - if [ ! -d "$BUILD_DIR" ]; then - log_error "Build directory not found: $BUILD_DIR" - exit 1 - fi - - if [ ! -f "$BUILD_DIR/bin/llama-bench" ]; then - log_warn "llama-bench executable not found, building..." - build_project - fi - - if [ ! -d "$STATS_DIR" ]; then - log_info "Creating stats directory..." - mkdir -p "$STATS_DIR" - fi - - log_info "Prerequisites check completed" -} - -# Backup original config file -backup_config() { - log_info "Backing up gemm-config.h..." - cp "$GEMM_CONFIG" "$GEMM_CONFIG_BACKUP" - log_info "Backup completed: $GEMM_CONFIG_BACKUP" -} - -# Restore original config file -restore_config() { - if [ -f "$GEMM_CONFIG_BACKUP" ]; then - log_info "Restoring original gemm-config.h..." - cp "$GEMM_CONFIG_BACKUP" "$GEMM_CONFIG" - rm "$GEMM_CONFIG_BACKUP" - log_info "Restore completed" - else - log_warn "Backup file not found, skipping restore" - fi -} - -# Set activation-parallel configuration (keep original ACT_PARALLEL) -set_activation_parallel() { - log_info "Configuration: activation-parallel (keeping #define ACT_PARALLEL)" - log_info "Configuration completed" -} - -# Set weight-parallel configuration (remove ACT_PARALLEL) -set_weight_parallel() { - log_info "Configuration: weight-parallel (removing #define ACT_PARALLEL)" - - # Remove ACT_PARALLEL definition - sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG" - - # Verify modification - if grep -q "^#define ACT_PARALLEL" "$GEMM_CONFIG"; then - log_error "Failed to remove ACT_PARALLEL" - exit 1 - fi - log_info "Configuration completed" -} - -# Set no-parallel configuration (remove ACT_PARALLEL + modify SIZE to 1) -set_no_parallel() { - log_info "Configuration: no-parallel (removing #define ACT_PARALLEL + modifying SIZE to 1)" - - # Remove ACT_PARALLEL definition - sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG" - - # Modify all ROW_BLOCK_SIZE and COL_BLOCK_SIZE to 1 - sed -i 's/#define ROW_BLOCK_SIZE [0-9]\+/#define ROW_BLOCK_SIZE 1/g' "$GEMM_CONFIG" - sed -i 's/#define COL_BLOCK_SIZE [0-9]\+/#define COL_BLOCK_SIZE 1/g' "$GEMM_CONFIG" - - log_info "Configuration completed" -} - -# Build project -build_project() { - log_info "Building project..." - cd "$PROJECT_ROOT" - - if [ ! -f "$BUILD_DIR/Makefile" ]; then - log_info "First build, running cmake..." - cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1 - fi - - cd "$BUILD_DIR" - make -j$(nproc) llama-bench > /dev/null 2>&1 - - if [ ! -f "./bin/llama-bench" ]; then - log_error "Build failed" - exit 1 - fi - - log_info "Build completed" - cd "$PROJECT_ROOT" -} - -# Run benchmark test -run_benchmark() { - local strategy=$1 - local threads=$2 - - cd "$PROJECT_ROOT" - - # Run llama-bench - local output=$($BENCHMARK_CMD -m "$MODEL_PATH" -p 128 -n 0 -t "$threads" -ngl 0 2>&1) - - # Extract line containing "pp128" - local line=$(echo "$output" | grep "pp128" | tail -1) - - if [ -z "$line" ]; then - return 1 - fi - - echo "$line" -} - -# Extract throughput value from benchmark output -extract_throughput() { - local line=$1 - - # Remove any leading/trailing whitespace and log messages - # The line format is: | model | size | params | backend | threads | test | throughput | - # We need to extract the last field which contains the throughput in format "XXX.XX ± YY.YY" - local throughput=$(echo "$line" | awk -F'|' '{print $NF}' | xargs | sed 's/\[.*\]//' | xargs) - - echo "$throughput" -} - -# Initialize CSV file -init_csv() { - log_info "Initializing CSV file: $CSV_FILE" - - cat > "$CSV_FILE" << 'EOF' -Strategy,Threads,Throughput -EOF - - log_info "CSV file created" -} - -# Add result to CSV -add_to_csv() { - local strategy=$1 - local threads=$2 - local throughput=$3 - - echo "$strategy,$threads,$throughput" >> "$CSV_FILE" -} - -# Main function -main() { - log_info "Starting GEMM parallel strategy benchmark tests" - log_info "================================================" - - # Check prerequisites - check_prerequisites - - # Backup original configuration - backup_config - - # Initialize CSV file - init_csv - - # Define strategies to test - local strategies=("activation-parallel" "weight-parallel" "no-parallel") - - for strategy in "${strategies[@]}"; do - log_info "================================================" - log_info "Testing strategy: $strategy" - log_info "================================================" - - # Restore to original configuration - restore_config - backup_config - - # Apply configuration based on strategy - case $strategy in - activation-parallel) - set_activation_parallel - ;; - weight-parallel) - set_weight_parallel - ;; - no-parallel) - set_no_parallel - ;; - esac - - # Rebuild project to apply new configuration - log_info "Rebuilding project to apply new configuration..." - build_project - - # Run test for each thread count - for threads in $THREADS_LIST; do - log_info "" - log_info "Strategy: $strategy, Threads: $threads" - - # Run test (capture only output, not log messages) - local result=$(run_benchmark "$strategy" "$threads") - local test_status=$? - - if [ $test_status -eq 0 ]; then - # Extract throughput value from the result line - local throughput=$(extract_throughput "$result") - log_info "Throughput: $throughput" - - # Add to CSV - add_to_csv "$strategy" "$threads" "$throughput" - else - log_warn "Test failed for strategy $strategy, threads $threads" - fi - - sleep 2 # Give system time to cool down - done - done - - # Restore original configuration - restore_config - - log_info "================================================" - log_info "Test completed!" - log_info "Results saved to: $CSV_FILE" - log_info "================================================" - - # Display CSV content - log_info "CSV file content:" - cat "$CSV_FILE" -} - -# Run main function -main "$@" diff --git a/utils/test_typical_shapes.sh b/utils/test_typical_shapes.sh deleted file mode 100755 index 6ad805c..0000000 --- a/utils/test_typical_shapes.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# Test typical matrix shapes for BitNet-2B model -# Based on BitNet-b1.58-2B-4T architecture - -echo "==========================================" -echo "BitNet-2B Typical Shapes Performance Test" -echo "==========================================" -echo "" - -ITERATIONS=1000 -BENCHMARK="../build/test_gemm_kernel" - -# Create stats directory if not exists -mkdir -p ../stats - -# Generate output CSV filename -CSV_FILE="../stats/gemm_kernel_test_noparal.csv" - -# Write CSV header -echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$CSV_FILE" -echo "Results will be saved to: $CSV_FILE" -echo "" - -# Function to extract metrics and append to CSV -extract_and_save() { - local test_name="$1" - local output="$2" - - # Extract values using grep and awk - local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}') - local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}') - local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}') - local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}') - local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}') - local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}') - local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}') - local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}') - - # Calculate standard deviation estimate from range (assuming ~95% of data within min-max) - # For normal distribution, range ≈ 4*std, so std ≈ range/4 - local std_time=$(echo "scale=4; ($max_time - $min_time) / 4" | bc) - - # Format as mean±std - local time_formatted="${avg_time}±${std_time}" - - # For GFLOPS and throughput, we don't have std info, so just use the value - # If you want to estimate std for these as well, you would need more data - - # Append to CSV - echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$CSV_FILE" -} - -echo "Test 1: Single Token Generation (Attention QKV projection)" -echo " Scenario: Generating 1 token at a time" -echo " Shape: n=2048, r=1, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "single_token_gen" "$OUTPUT" -echo "" - -echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)" -echo " Scenario: Processing prompt with 128 tokens, batch size 1" -echo " Shape: n=2048, r=128, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "small_batch_prompt" "$OUTPUT" -echo "" - -echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)" -echo " Scenario: Processing prompt with 256 tokens or batch of 256" -echo " Shape: n=2048, r=256, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "medium_batch_prompt" "$OUTPUT" -echo "" - -echo "Test 4: Large Batch Processing (Attention QKV projection)" -echo " Scenario: Processing 512 tokens or batch of 512" -echo " Shape: n=2048, r=512, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "large_batch_prompt" "$OUTPUT" -echo "" - -echo "Test 5: FFN Up-projection (Small batch)" -echo " Scenario: Feed-forward network expansion, 128 tokens" -echo " Shape: n=2048, r=128, c=8192" -OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "ffn_up_projection" "$OUTPUT" -echo "" - -echo "Test 6: FFN Down-projection (Small batch)" -echo " Scenario: Feed-forward network reduction, 128 tokens" -echo " Shape: n=8192, r=128, c=2048" -OUTPUT=$($BENCHMARK -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "ffn_down_projection" "$OUTPUT" -echo "" - -echo "Test 7: Long Context Processing" -echo " Scenario: Processing very long context (2048 tokens)" -echo " Shape: n=2048, r=2048, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "long_context" "$OUTPUT" -echo "" - -echo "Test 8: Batched Token Generation" -echo " Scenario: Generating tokens for 32 sequences simultaneously" -echo " Shape: n=2048, r=32, c=2048" -OUTPUT=$($BENCHMARK -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1) -echo "$OUTPUT" -extract_and_save "batched_token_gen" "$OUTPUT" -echo "" - -echo "==========================================" -echo "All tests completed!" -echo "Results saved to: $CSV_FILE" -echo "==========================================" diff --git a/utils/tune_gemm_config.py b/utils/tune_gemm_config.py index 83b4218..e537cd8 100644 --- a/utils/tune_gemm_config.py +++ b/utils/tune_gemm_config.py @@ -35,85 +35,16 @@ class GemmTuner: shutil.copy2(self.backup_path, self.config_path) def generate_config(self, act_parallel, row_block_size, col_block_size, parallel_size): - """Generate new configuration file""" + """Generate new configuration file with simplified format""" content = "" - # ACT_PARALLEL definition + # Simplified configuration format if act_parallel: content += "#define ACT_PARALLEL\n" - else: - content += "// #define ACT_PARALLEL\n" - # Detect architecture branches in original config file - with open(self.backup_path, 'r') as f: - original = f.read() - - has_avx = "__AVX__" in original or "__AVX2__" in original - has_arm = "__ARM_NEON" in original - - # If architecture detection exists, generate corresponding branches - if has_avx and has_arm: - # Multi-architecture configuration - content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n" - content += "#if defined(ACT_PARALLEL)\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#else\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#endif\n" - content += "#elif defined(__ARM_NEON)\n" - content += "#if defined(ACT_PARALLEL)\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#else\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#endif\n" - content += "#endif\n" - elif has_avx: - # AVX architecture only - content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n" - content += "#if defined(ACT_PARALLEL)\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#else\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#endif\n" - content += "#endif\n" - elif has_arm: - # ARM architecture only - content += "#if defined(__ARM_NEON)\n" - content += "#if defined(ACT_PARALLEL)\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#else\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#endif\n" - content += "#endif\n" - else: - # No architecture detection, define directly - content += "#if defined(ACT_PARALLEL)\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#else\n" - content += f" #define ROW_BLOCK_SIZE {row_block_size}\n" - content += f" #define COL_BLOCK_SIZE {col_block_size}\n" - content += f" #define PARALLEL_SIZE {parallel_size}\n" - content += "#endif\n" - - content += "\n" + content += f"#define ROW_BLOCK_SIZE {row_block_size}\n" + content += f"#define COL_BLOCK_SIZE {col_block_size}\n" + content += f"#define PARALLEL_SIZE {parallel_size}\n" with open(self.config_path, 'w') as f: f.write(content) @@ -259,9 +190,12 @@ class GemmTuner: # Save results if output_csv is None: timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - csv_path = f"stats/tuning_results_{timestamp}.csv" + csv_path = f"../stats/tuning_results_{timestamp}.csv" else: csv_path = output_csv + + # Ensure stats directory exists + os.makedirs(os.path.dirname(csv_path), exist_ok=True) self.save_results(csv_path) # Find best configuration @@ -278,8 +212,18 @@ class GemmTuner: print(f"PP128 Throughput: {best['pp_throughput']:.2f} ± {best['pp_std_dev']:.2f} t/s") print(f"{'='*80}\n") + # Show the configuration that will be written + print("Configuration to be written to gemm-config.h:") + print("-" * 80) + if best['act_parallel']: + print("#define ACT_PARALLEL") + print(f"#define ROW_BLOCK_SIZE {best['row_block_size']}") + print(f"#define COL_BLOCK_SIZE {best['col_block_size']}") + print(f"#define PARALLEL_SIZE {best['parallel_size']}") + print("-" * 80) + # Apply best configuration - apply = input("Do you want to apply this configuration? (y/n): ").strip().lower() + apply = input("\nDo you want to apply this configuration to gemm-config.h? (y/n): ").strip().lower() if apply == 'y': self.generate_config( best['act_parallel'], @@ -288,17 +232,30 @@ class GemmTuner: best['parallel_size'] ) self.rebuild_project() - print("✅ Best configuration applied!") + print("✅ Best configuration applied and project rebuilt!") else: self.restore_config() print("✅ Original configuration restored") + + # Clean up backup file + if self.backup_path.exists(): + self.backup_path.unlink() + print(f"🗑️ Removed backup file: {self.backup_path}") except KeyboardInterrupt: print("\n⚠️ Tuning interrupted by user") self.restore_config() + # Clean up backup file + if self.backup_path.exists(): + self.backup_path.unlink() + print(f"🗑️ Removed backup file: {self.backup_path}") except Exception as e: print(f"\n❌ Error during tuning: {e}") self.restore_config() + # Clean up backup file + if self.backup_path.exists(): + self.backup_path.unlink() + print(f"🗑️ Removed backup file: {self.backup_path}") raise @@ -308,9 +265,9 @@ def generate_configurations(): act_parallel_options = [True] - row_sizes = [2, 4, 8, 16, 32] - col_sizes = [32, 64, 128, 256, 512, 1024] - parallelism_degree = [2, 4, 8] + row_sizes = [2, 4, 8]#[2, 4, 8, 16, 32] + col_sizes = [32, 64]#[32, 64, 128, 256, 512, 1024] + parallelism_degree = [4] for act_parallel in act_parallel_options: for row in row_sizes: