[modify] some utils test script

This commit is contained in:
XSquirrelC
2026-01-24 08:40:36 +00:00
parent 2fed9af730
commit 7b2c52b9d5
6 changed files with 339 additions and 556 deletions
-76
View File
@@ -1,76 +0,0 @@
#!/bin/bash
# Build script for standalone GEMM kernel benchmark
#
# All paths below are relative to the current working directory, so run this
# from the directory that contains test_gemm_kernel.cpp. libggml.so must
# already have been built under ../build. CXX may be overridden from the
# environment.
set -e
echo "Building GEMM kernel benchmark..."
# Compiler settings
CXX=${CXX:-g++}
BUILD_DIR="../build"
SRC_DIR="../src"
# Create build directory if it doesn't exist
mkdir -p "${BUILD_DIR}"
# Compiler flags
CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp"
CXXFLAGS+=" -I.. -I../include"
CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/include"
CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/src"
CXXFLAGS+=" -I../3rdparty/llama.cpp/include"
CXXFLAGS+=" -DNDEBUG -ffast-math"
# Link flags
LDFLAGS="-lm -lpthread"
# Link with pre-built libraries
GGML_LIB_SUBDIR="3rdparty/llama.cpp/ggml/src"
GGML_LIB_DIR="${BUILD_DIR}/${GGML_LIB_SUBDIR}"
GGML_SO="${GGML_LIB_DIR}/libggml.so"
if [ ! -f "${GGML_SO}" ]; then
  echo "⚠️ Warning: Cannot find libggml.so"
  echo "Please build the project first with: cmake --build build"
  exit 1
fi
# The binary is written into ${BUILD_DIR}, so at run time $ORIGIN is the build
# directory itself and the library lives in ${GGML_LIB_SUBDIR} below it.
# (Prefixing the cwd-relative ${GGML_LIB_DIR} with $ORIGIN/../.. pointed
# outside the project tree.)
LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,\$ORIGIN/${GGML_LIB_SUBDIR}"
echo "Linking with libggml.so"
# Source files
SOURCES="./test_gemm_kernel.cpp"
# Output binary
OUTPUT="${BUILD_DIR}/test_gemm_kernel"
echo "Compiler: ${CXX}"
echo "Flags: ${CXXFLAGS}"
echo "Sources: ${SOURCES}"
echo ""
# Build
# The compiler runs as the `if` condition: with `set -e` active, a separate
# `$?` check afterwards would never be reached on failure.
# CXX, CXXFLAGS, SOURCES and LDFLAGS are intentionally unquoted so that each
# space-separated flag becomes its own argument.
if ${CXX} ${CXXFLAGS} ${SOURCES} -o "${OUTPUT}" ${LDFLAGS}; then
  echo ""
  echo "✅ Build successful!"
  echo "Output: ${OUTPUT}"
  echo ""
  echo "Usage examples:"
  echo " # Default test (n=2048, nr=32, nc=128, 1000 iterations)"
  echo " ${OUTPUT}"
  echo ""
  echo " # Custom matrix sizes"
  echo " ${OUTPUT} -n 4096 -r 64 -c 256"
  echo ""
  echo " # Quick test (fewer iterations)"
  echo " ${OUTPUT} -i 100 -w 5"
  echo ""
  echo " # Large-scale test"
  echo " ${OUTPUT} -n 3200 -r 128 -c 512 -i 500"
  echo ""
else
  echo ""
  echo "❌ Build failed!"
  exit 1
fi
View File
+302 -3
View File
@@ -1,3 +1,110 @@
#!/bin/bash
# Unified GEMM kernel benchmark script
# Builds, tests, and benchmarks the GEMM kernel with configurable output
set -e
# Default values
BUILD_DIR="../build"
ITERATIONS=1000
OUTPUT_CSV=""
SKIP_BUILD=false
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Print the command-line help text (options and examples) to stdout.
# The program name shown in the text is taken from $0.
print_usage() {
  local prog=$0
  cat <<USAGE
Usage: ${prog} [options]
Options:
-o, --output <path> Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv)
-i, --iterations <num> Number of iterations per test (default: 1000)
-s, --skip-build Skip building the benchmark binary
-h, --help Show this help message
Examples:
# Run with default settings
${prog}
# Specify custom output file
${prog} -o /path/to/my_results.csv
# Quick test with fewer iterations
${prog} -i 100 -o quick_test.csv
# Skip build if already compiled
${prog} -s -o results.csv
USAGE
}
# Parse command line arguments
#
# Sets the globals OUTPUT_CSV, ITERATIONS and SKIP_BUILD from the given
# argument list. Prints the usage text and exits 0 for -h/--help; prints an
# error plus the usage text and exits 1 for an unknown option or for an
# option that is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)
        # Without this check `shift 2` fails when the value is missing, and
        # the script dies without any message under `set -e`.
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          print_usage
          exit 1
        fi
        OUTPUT_CSV="$2"
        shift 2
        ;;
      -i|--iterations)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          print_usage
          exit 1
        fi
        ITERATIONS="$2"
        shift 2
        ;;
      -s|--skip-build)
        SKIP_BUILD=true
        shift
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        print_usage
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Set default output CSV if not specified
if [ -z "$OUTPUT_CSV" ]; then
OUTPUT_CSV="${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv"
fi
# Create output directory first
mkdir -p "$(dirname "$OUTPUT_CSV")"
# Convert to absolute path
if [[ "$OUTPUT_CSV" = /* ]]; then
# Already absolute path
OUTPUT_CSV="$OUTPUT_CSV"
else
# Convert relative path to absolute
OUTPUT_CSV="$(cd "$(dirname "$OUTPUT_CSV")" && pwd)/$(basename "$OUTPUT_CSV")"
fi
echo "=========================================="
echo "GEMM Kernel Benchmark Suite"
echo "=========================================="
echo "Configuration:"
echo " Iterations: $ITERATIONS"
echo " Output CSV: $OUTPUT_CSV"
echo " Skip build: $SKIP_BUILD"
echo "=========================================="
echo ""
# Build the benchmark binary
if [ "$SKIP_BUILD" = false ]; then
echo "Step 1: Building GEMM kernel benchmark..."
echo "------------------------------------------"
CXX=${CXX:-g++}
# Create build directory if it doesn't exist
mkdir -p "${SCRIPT_DIR}/${BUILD_DIR}"
# Create temporary C++ source file
TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp"
cat > "${TEMP_CPP}" << 'EOF'
/**
* Standalone benchmark for ggml_gemm_i2_i8_s kernel
*
@@ -131,13 +238,20 @@ void run_benchmark(const BenchmarkConfig& config) {
printf("Allocating matrices...\n");
// X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes
uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4);
// Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes)
size_t x_size = config.nc * config.n / 4;
size_t x_size_aligned = ((x_size + 63) / 64) * 64;
uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned);
// Y matrix (i8 format): nr x n
int8_t* Y = (int8_t*)malloc(config.nr * config.n);
size_t y_size = config.nr * config.n;
size_t y_size_aligned = ((y_size + 63) / 64) * 64;
int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned);
// Result matrix (float32): nr x nc
float* S = (float*)malloc(config.nr * config.nc * sizeof(float));
size_t s_size = config.nr * config.nc * sizeof(float);
size_t s_size_aligned = ((s_size + 63) / 64) * 64;
float* S = (float*)aligned_alloc(64, s_size_aligned);
if (!X || !Y || !S) {
fprintf(stderr, "Failed to allocate memory\n");
@@ -272,3 +386,188 @@ int main(int argc, char** argv) {
return 0;
}
EOF
# Compiler flags
CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp"
CXXFLAGS+=" -I${SCRIPT_DIR}/.. -I${SCRIPT_DIR}/../include"
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include"
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src"
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/include"
CXXFLAGS+=" -DNDEBUG -ffast-math"
# Link flags
LDFLAGS="-lm -lpthread"
# Link with pre-built libraries
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
GGML_SO="${GGML_LIB_DIR}/libggml.so"
if [ ! -f "${GGML_SO}" ]; then
echo "❌ Error: Cannot find libggml.so at ${GGML_SO}"
echo "Please build the project first with: cmake --build build"
rm -f "${TEMP_CPP}"
exit 1
fi
LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,${GGML_LIB_DIR}"
# Output binary
BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
echo "Compiler: ${CXX}"
echo "Building from embedded source..."
echo ""
# Build
${CXX} ${CXXFLAGS} "${TEMP_CPP}" -o ${BENCHMARK_BIN} ${LDFLAGS}
if [ $? -eq 0 ]; then
echo "✅ Build successful!"
rm -f "${TEMP_CPP}"
echo ""
else
echo "❌ Build failed!"
rm -f "${TEMP_CPP}"
exit 1
fi
else
echo "Step 1: Skipping build (using existing binary)"
echo "------------------------------------------"
BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
if [ ! -f "${BENCHMARK_BIN}" ]; then
echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}"
echo "Please run without -s to build it first."
exit 1
fi
echo "✅ Found existing binary"
echo ""
fi
# Set LD_LIBRARY_PATH to include the GGML library directory
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
export LD_LIBRARY_PATH="${GGML_LIB_DIR}:${LD_LIBRARY_PATH}"
echo "Step 2: Running benchmark tests"
echo "------------------------------------------"
echo "Library path: ${GGML_LIB_DIR}"
echo ""
# Write CSV header
echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$OUTPUT_CSV"
echo "Results will be saved to: $OUTPUT_CSV"
echo ""
# Pull the metrics out of one benchmark run's text output and append them to
# the results CSV as a single row.
#
# Arguments: $1 - test name used for the first CSV column
#            $2 - captured output of the benchmark binary
# Globals:   OUTPUT_CSV (appended to)
# Outputs:   a warning on stdout when the timing lines cannot be found; in
#            that case the row is written with N/A in the metric columns.
extract_and_save() {
  local label="$1"
  local report="$2"
  local dim rows cols mean lo hi flops tput spread
  # Each value is a fixed whitespace-separated field of the line that
  # contains the given label text.
  dim=$(echo "$report" | awk '/Embedding dimension/ {print $5}')
  rows=$(echo "$report" | awk '/Matrix Y rows/ {print $6}')
  cols=$(echo "$report" | awk '/Matrix X columns/ {print $6}')
  mean=$(echo "$report" | awk '/Average time/ {print $4}')
  lo=$(echo "$report" | awk '/Min time/ {print $4}')
  hi=$(echo "$report" | awk '/Max time/ {print $4}')
  flops=$(echo "$report" | awk '/GFLOPS/ {print $3}')
  tput=$(echo "$report" | awk '/Throughput/ {print $3}')
  if [ -n "$mean" ] && [ -n "$lo" ] && [ -n "$hi" ]; then
    # Standard deviation is only estimated, as a quarter of the min-max range.
    spread=$(awk -v min="$lo" -v max="$hi" 'BEGIN {printf "%.4f", (max - min) / 4}')
    # Time column is written as mean±std.
    echo "${label},${dim},${rows},${cols},${mean}±${spread},${flops},${tput}" >> "$OUTPUT_CSV"
  else
    echo "Warning: Failed to extract timing data for ${label}"
    echo "${label},${dim},${rows},${cols},N/A,N/A,N/A" >> "$OUTPUT_CSV"
  fi
}
# Run benchmark tests
#
# run_case NAME TITLE SCENARIO N R C
#   Prints the test banner, runs the benchmark binary for one matrix shape,
#   echoes its output and records the metrics through extract_and_save.
#   NAME     - identifier written to the CSV
#   TITLE    - heading line printed before the run
#   SCENARIO - one-line description printed under the heading
#   N, R, C  - values passed to the binary as -n, -r and -c
# Globals: BENCHMARK_BIN, ITERATIONS (read)
# If the binary exits non-zero, its output is still printed before the
# script stops with the same status; a bare assignment under `set -e` would
# abort first and hide the output.
run_case() {
  local name=$1 title=$2 scenario=$3 n=$4 r=$5 c=$6
  local output status=0
  echo "$title"
  echo " Scenario: $scenario"
  echo " Shape: n=$n, r=$r, c=$c"
  # BENCHMARK_BIN is quoted because it is derived from the script's location,
  # which may contain spaces.
  output=$("$BENCHMARK_BIN" -n "$n" -r "$r" -c "$c" -i "$ITERATIONS" 2>&1) || status=$?
  echo "$output"
  if [ "$status" -ne 0 ]; then
    echo "❌ Error: benchmark '${name}' exited with status ${status}" >&2
    exit "$status"
  fi
  extract_and_save "$name" "$output"
  echo ""
}
echo "=========================================="
echo "BitNet-2B Typical Shapes Performance Test"
echo "=========================================="
echo ""
run_case "single_token_gen" \
  "Test 1: Single Token Generation (Attention QKV projection)" \
  "Generating 1 token at a time" 2048 1 2048
run_case "small_batch_prompt" \
  "Test 2: Small Batch Prompt Processing (Attention QKV projection)" \
  "Processing prompt with 128 tokens, batch size 1" 2048 128 2048
run_case "medium_batch_prompt" \
  "Test 3: Medium Batch Prompt Processing (Attention QKV projection)" \
  "Processing prompt with 256 tokens or batch of 256" 2048 256 2048
run_case "large_batch_prompt" \
  "Test 4: Large Batch Processing (Attention QKV projection)" \
  "Processing 512 tokens or batch of 512" 2048 512 2048
run_case "ffn_up_projection" \
  "Test 5: FFN Up-projection (Small batch)" \
  "Feed-forward network expansion, 128 tokens" 2048 128 8192
run_case "ffn_down_projection" \
  "Test 6: FFN Down-projection (Small batch)" \
  "Feed-forward network reduction, 128 tokens" 8192 128 2048
run_case "long_context" \
  "Test 7: Long Context Processing" \
  "Processing very long context (2048 tokens)" 2048 2048 2048
run_case "batched_token_gen" \
  "Test 8: Batched Token Generation" \
  "Generating tokens for 32 sequences simultaneously" 2048 32 2048
echo "=========================================="
echo "All tests completed successfully!"
echo "=========================================="
echo "Results saved to: $OUTPUT_CSV"
echo ""
echo "Summary:"
wc -l "$OUTPUT_CSV" | awk '{print " Total records:", $1 - 1}'
echo " Output file: $OUTPUT_CSV"
echo "=========================================="
-277
View File
@@ -1,277 +0,0 @@
#!/bin/bash
# Script: Test different GEMM parallel strategy performance
# Strategies: weight-parallel and no-parallel
# Thread counts: 1,2,4,8,12,16
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
GEMM_CONFIG="$PROJECT_ROOT/include/gemm-config.h"
GEMM_CONFIG_BACKUP="$PROJECT_ROOT/include/gemm-config.h.bak"
BUILD_DIR="$PROJECT_ROOT/build"
STATS_DIR="$PROJECT_ROOT/stats"
CSV_FILE="$STATS_DIR/test_parallel_strategy_benchmark.csv"
MODEL_PATH="$PROJECT_ROOT/models/BitNet-b1.58-2B-4T/ggml-model-original.gguf"
BENCHMARK_CMD="./build/bin/llama-bench"
THREADS_LIST="1 2 4 8 12 16"
# Color output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Print an informational message on stdout, prefixed with a green [INFO] tag.
# Backslash escapes in the colour variables and in the message are expanded.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# Print a warning message on stdout, prefixed with a yellow [WARN] tag.
# Backslash escapes in the colour variables and in the message are expanded.
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Print an error message on stdout, prefixed with a red [ERROR] tag.
# Backslash escapes in the colour variables and in the message are expanded.
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Verify that everything the benchmark needs is in place.
#
# Exits with status 1 when the GEMM config header, the model file or the
# build directory is missing. Builds llama-bench when the executable is
# absent and creates the stats directory when needed.
# Globals: GEMM_CONFIG, MODEL_PATH, BUILD_DIR, STATS_DIR (read)
check_prerequisites() {
  log_info "Checking prerequisites..."
  # Hard requirements: any one missing aborts the script.
  [ -f "$GEMM_CONFIG" ] || {
    log_error "gemm-config.h not found: $GEMM_CONFIG"
    exit 1
  }
  [ -f "$MODEL_PATH" ] || {
    log_error "Model file not found: $MODEL_PATH"
    exit 1
  }
  [ -d "$BUILD_DIR" ] || {
    log_error "Build directory not found: $BUILD_DIR"
    exit 1
  }
  # Soft requirements: these can be produced on the spot.
  if ! [ -f "$BUILD_DIR/bin/llama-bench" ]; then
    log_warn "llama-bench executable not found, building..."
    build_project
  fi
  [ -d "$STATS_DIR" ] || {
    log_info "Creating stats directory..."
    mkdir -p "$STATS_DIR"
  }
  log_info "Prerequisites check completed"
}
# Backup original config file
#
# Copies GEMM_CONFIG to GEMM_CONFIG_BACKUP. Exits with status 1 when the copy
# fails: the strategy setters edit the config in place, so carrying on
# without a backup would make the original unrecoverable.
# Globals: GEMM_CONFIG, GEMM_CONFIG_BACKUP (read)
backup_config() {
  log_info "Backing up gemm-config.h..."
  if ! cp -- "$GEMM_CONFIG" "$GEMM_CONFIG_BACKUP"; then
    log_error "Failed to back up $GEMM_CONFIG to $GEMM_CONFIG_BACKUP"
    exit 1
  fi
  log_info "Backup completed: $GEMM_CONFIG_BACKUP"
}
# Restore original config file
#
# Copies GEMM_CONFIG_BACKUP back over GEMM_CONFIG and then removes the
# backup. The backup is deleted only after the copy succeeded, so a failed
# restore never destroys the only pristine copy.
# Globals: GEMM_CONFIG, GEMM_CONFIG_BACKUP (read)
# Returns: 0 when restored or when there is no backup to restore,
#          1 when the copy failed (backup left in place).
restore_config() {
  if [ ! -f "$GEMM_CONFIG_BACKUP" ]; then
    log_warn "Backup file not found, skipping restore"
    return 0
  fi
  log_info "Restoring original gemm-config.h..."
  if ! cp -- "$GEMM_CONFIG_BACKUP" "$GEMM_CONFIG"; then
    log_error "Failed to restore $GEMM_CONFIG; backup kept at $GEMM_CONFIG_BACKUP"
    return 1
  fi
  rm -f -- "$GEMM_CONFIG_BACKUP"
  log_info "Restore completed"
}
# Select the activation-parallel strategy.
# The config file is left untouched; this only logs that the existing
# ACT_PARALLEL definition is being kept.
set_activation_parallel() {
  local note
  for note in \
    "Configuration: activation-parallel (keeping #define ACT_PARALLEL)" \
    "Configuration completed"; do
    log_info "$note"
  done
}
# Select the weight-parallel strategy by deleting, in place, every line of
# GEMM_CONFIG that contains "#define ACT_PARALLEL".
# Exits with status 1 if a line starting with that define is still present
# afterwards.
# Globals: GEMM_CONFIG (file edited in place)
set_weight_parallel() {
  log_info "Configuration: weight-parallel (removing #define ACT_PARALLEL)"
  # Drop the definition
  sed -i -e '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
  # Confirm it is gone before reporting success
  if ! grep -q "^#define ACT_PARALLEL" "$GEMM_CONFIG"; then
    log_info "Configuration completed"
    return
  fi
  log_error "Failed to remove ACT_PARALLEL"
  exit 1
}
# Select the no-parallel strategy: delete every line of GEMM_CONFIG that
# contains "#define ACT_PARALLEL" and force every ROW_BLOCK_SIZE and
# COL_BLOCK_SIZE definition to 1.
# Globals: GEMM_CONFIG (file edited in place)
set_no_parallel() {
  local edit
  log_info "Configuration: no-parallel (removing #define ACT_PARALLEL + modifying SIZE to 1)"
  # Applied one after another, each as its own in-place pass:
  #   1. remove the ACT_PARALLEL definition
  #   2. set ROW_BLOCK_SIZE to 1
  #   3. set COL_BLOCK_SIZE to 1
  for edit in \
    '/#define ACT_PARALLEL/d' \
    's/#define ROW_BLOCK_SIZE [0-9]\+/#define ROW_BLOCK_SIZE 1/g' \
    's/#define COL_BLOCK_SIZE [0-9]\+/#define COL_BLOCK_SIZE 1/g'; do
    sed -i "$edit" "$GEMM_CONFIG"
  done
  log_info "Configuration completed"
}
# Build project
#
# Configures the build tree with CMake when no Makefile exists yet, then
# builds the llama-bench target. Build output is discarded. Exits with
# status 1 when a directory cannot be entered, when CMake or make fails, or
# when the executable is missing afterwards. On success the working
# directory is PROJECT_ROOT.
# Globals: PROJECT_ROOT, BUILD_DIR (read)
build_project() {
  log_info "Building project..."
  if ! cd "$PROJECT_ROOT"; then
    log_error "Cannot enter project root: $PROJECT_ROOT"
    exit 1
  fi
  if [ ! -f "$BUILD_DIR/Makefile" ]; then
    log_info "First build, running cmake..."
    if ! cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1; then
      log_error "CMake configuration failed"
      exit 1
    fi
  fi
  if ! cd "$BUILD_DIR"; then
    log_error "Cannot enter build directory: $BUILD_DIR"
    exit 1
  fi
  # Check make's own exit status as well as the binary: a binary left over
  # from an earlier build would otherwise make a failed rebuild look fine and
  # the benchmark would silently measure the previous configuration.
  if ! make -j"$(nproc)" llama-bench > /dev/null 2>&1 || [ ! -f "./bin/llama-bench" ]; then
    log_error "Build failed"
    exit 1
  fi
  log_info "Build completed"
  if ! cd "$PROJECT_ROOT"; then
    log_error "Cannot enter project root: $PROJECT_ROOT"
    exit 1
  fi
}
# Run llama-bench once from PROJECT_ROOT and print the last line of its
# output that contains "pp128".
#
# Arguments: $1 - strategy label (not used here)
#            $2 - value passed to llama-bench as -t
# Globals:   PROJECT_ROOT, BENCHMARK_CMD, MODEL_PATH (read)
# Outputs:   the matching line on stdout
# Returns:   1 when no line contains "pp128"
run_benchmark() {
  local nthreads=$2
  cd "$PROJECT_ROOT"
  # Run llama-bench, capturing stdout and stderr together
  local bench_log=$($BENCHMARK_CMD -m "$MODEL_PATH" -p 128 -n 0 -t "$nthreads" -ngl 0 2>&1)
  # Keep only the final line that contains "pp128"
  local last_match=$(echo "$bench_log" | grep "pp128" | tail -n 1)
  [ -n "$last_match" ] || return 1
  echo "$last_match"
}
# Extract throughput value from benchmark output
#
# The line format is: | model | size | params | backend | threads | test | throughput |
# Because the row ends with a closing "|", splitting on "|" leaves an empty
# field after it; taking the very last field would therefore always give an
# empty string. The last NON-empty field is used instead, which also keeps
# working for lines that have no closing pipe.
#
# Arguments: $1 - one result line from the benchmark table
# Outputs:   the throughput text (e.g. "XXX.XX ± YY.YY") on stdout, with any
#            [bracketed] part removed and surrounding whitespace trimmed
extract_throughput() {
  local line=$1
  printf '%s\n' "$line" | awk -F'|' '{
    n = NF
    # Step back over blank fields produced by the closing pipe(s).
    while (n > 1 && $n ~ /^[ \t]*$/) n--
    value = $n
    sub(/\[.*\]/, "", value)        # drop a bracketed annotation, if any
    gsub(/[ \t]+/, " ", value)      # collapse runs of whitespace
    sub(/^ /, "", value)            # trim leading space
    sub(/ $/, "", value)            # trim trailing space
    print value
  }'
}
# Create (or truncate) the results CSV and write its header row.
# Globals: CSV_FILE (overwritten)
init_csv() {
  log_info "Initializing CSV file: $CSV_FILE"
  printf '%s\n' "Strategy,Threads,Throughput" > "$CSV_FILE"
  log_info "CSV file created"
}
# Append one "strategy,threads,throughput" row to the results CSV.
# Arguments: $1 - strategy, $2 - thread count, $3 - throughput text
# Globals:   CSV_FILE (appended to)
add_to_csv() {
  printf '%s,%s,%s\n' "$1" "$2" "$3" >> "$CSV_FILE"
}
# Main function
#
# For each parallel strategy: reset gemm-config.h to its original contents,
# apply the strategy, rebuild, then run the benchmark once per thread count
# in THREADS_LIST and record the throughput in the CSV. The original config
# is restored at the end, and also if the script exits early or is
# interrupted part-way through.
# Globals: THREADS_LIST, CSV_FILE (read)
main() {
  # Define strategies to test
  local strategies=("activation-parallel" "weight-parallel" "no-parallel")
  local strategy threads result throughput test_status
  log_info "Starting GEMM parallel strategy benchmark tests"
  log_info "================================================"
  # Check prerequisites
  check_prerequisites
  # Backup original configuration
  backup_config
  # From here on gemm-config.h may be edited in place; make sure the original
  # comes back on every exit path (a helper calling `exit`, Ctrl-C, kill).
  trap restore_config EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM
  # Initialize CSV file
  init_csv
  for strategy in "${strategies[@]}"; do
    log_info "================================================"
    log_info "Testing strategy: $strategy"
    log_info "================================================"
    # Restore to original configuration
    restore_config
    backup_config
    # Apply configuration based on strategy
    case $strategy in
      activation-parallel)
        set_activation_parallel
        ;;
      weight-parallel)
        set_weight_parallel
        ;;
      no-parallel)
        set_no_parallel
        ;;
    esac
    # Rebuild project to apply new configuration
    log_info "Rebuilding project to apply new configuration..."
    build_project
    # Run test for each thread count
    for threads in $THREADS_LIST; do
      log_info ""
      log_info "Strategy: $strategy, Threads: $threads"
      # Run test (capture only output, not log messages).
      # The assignment is kept separate from `local`: `local var=$(cmd)`
      # reports the status of `local` itself (always 0), which would hide a
      # failed benchmark and record an empty throughput.
      test_status=0
      result=$(run_benchmark "$strategy" "$threads") || test_status=$?
      if [ "$test_status" -eq 0 ]; then
        # Extract throughput value from the result line
        throughput=$(extract_throughput "$result")
        log_info "Throughput: $throughput"
        # Add to CSV
        add_to_csv "$strategy" "$threads" "$throughput"
      else
        log_warn "Test failed for strategy $strategy, threads $threads"
      fi
      sleep 2 # Give system time to cool down
    done
  done
  # Restore original configuration
  restore_config
  trap - EXIT INT TERM
  log_info "================================================"
  log_info "Test completed!"
  log_info "Results saved to: $CSV_FILE"
  log_info "================================================"
  # Display CSV content
  log_info "CSV file content:"
  cat "$CSV_FILE"
}
# Run main function
main "$@"
-120
View File
@@ -1,120 +0,0 @@
#!/bin/bash
# Test typical matrix shapes for BitNet-2B model
# Based on BitNet-b1.58-2B-4T architecture
echo "=========================================="
echo "BitNet-2B Typical Shapes Performance Test"
echo "=========================================="
echo ""
ITERATIONS=1000
BENCHMARK="../build/test_gemm_kernel"
# Create stats directory if not exists
mkdir -p ../stats
# Generate output CSV filename
CSV_FILE="../stats/gemm_kernel_test_noparal.csv"
# Write CSV header
echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$CSV_FILE"
echo "Results will be saved to: $CSV_FILE"
echo ""
# Function to extract metrics and append to CSV
#
# Arguments: $1 - test name used for the first CSV column
#            $2 - captured output of the benchmark binary
# Globals:   CSV_FILE (appended to)
# Outputs:   a warning on stdout when the timing lines cannot be found; the
#            row is then written with N/A in the metric columns instead of
#            feeding an incomplete expression to the calculator.
extract_and_save() {
  local test_name="$1"
  local output="$2"
  local n nr nc avg_time min_time max_time gflops throughput std_time
  # Extract values using grep and awk
  n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}')
  nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}')
  nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}')
  avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}')
  min_time=$(echo "$output" | grep "Min time" | awk '{print $4}')
  max_time=$(echo "$output" | grep "Max time" | awk '{print $4}')
  gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}')
  throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}')
  # Without all three timings the range below cannot be computed.
  if [ -z "$avg_time" ] || [ -z "$min_time" ] || [ -z "$max_time" ]; then
    echo "Warning: Failed to extract timing data for ${test_name}"
    echo "${test_name},${n},${nr},${nc},N/A,N/A,N/A" >> "$CSV_FILE"
    return
  fi
  # Calculate standard deviation estimate from range (assuming ~95% of data within min-max)
  # For normal distribution, range ≈ 4*std, so std ≈ range/4
  # awk is used rather than bc: it is already required above, and its
  # "%.4f" keeps the leading zero that bc drops (0.2500 rather than .2500).
  std_time=$(awk -v min="$min_time" -v max="$max_time" 'BEGIN {printf "%.4f", (max - min) / 4}')
  # Format as mean±std
  local time_formatted="${avg_time}±${std_time}"
  # For GFLOPS and throughput, we don't have std info, so just use the value
  # Append to CSV
  echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$CSV_FILE"
}
# run_shape_test NAME TITLE SCENARIO N R C
#   Prints the heading for one matrix shape, runs the benchmark binary with
#   -n N -r R -c C, echoes its output and records it via extract_and_save.
# Globals: BENCHMARK, ITERATIONS (read); OUTPUT (set to the captured output)
run_shape_test() {
  echo "$2"
  echo " Scenario: $3"
  echo " Shape: n=$4, r=$5, c=$6"
  OUTPUT=$($BENCHMARK -n $4 -r $5 -c $6 -i $ITERATIONS 2>&1)
  echo "$OUTPUT"
  extract_and_save "$1" "$OUTPUT"
  echo ""
}
run_shape_test "single_token_gen" \
  "Test 1: Single Token Generation (Attention QKV projection)" \
  "Generating 1 token at a time" 2048 1 2048
run_shape_test "small_batch_prompt" \
  "Test 2: Small Batch Prompt Processing (Attention QKV projection)" \
  "Processing prompt with 128 tokens, batch size 1" 2048 128 2048
run_shape_test "medium_batch_prompt" \
  "Test 3: Medium Batch Prompt Processing (Attention QKV projection)" \
  "Processing prompt with 256 tokens or batch of 256" 2048 256 2048
run_shape_test "large_batch_prompt" \
  "Test 4: Large Batch Processing (Attention QKV projection)" \
  "Processing 512 tokens or batch of 512" 2048 512 2048
run_shape_test "ffn_up_projection" \
  "Test 5: FFN Up-projection (Small batch)" \
  "Feed-forward network expansion, 128 tokens" 2048 128 8192
run_shape_test "ffn_down_projection" \
  "Test 6: FFN Down-projection (Small batch)" \
  "Feed-forward network reduction, 128 tokens" 8192 128 2048
run_shape_test "long_context" \
  "Test 7: Long Context Processing" \
  "Processing very long context (2048 tokens)" 2048 2048 2048
run_shape_test "batched_token_gen" \
  "Test 8: Batched Token Generation" \
  "Generating tokens for 32 sequences simultaneously" 2048 32 2048
echo "=========================================="
echo "All tests completed!"
echo "Results saved to: $CSV_FILE"
echo "=========================================="
+37 -80
View File
@@ -35,85 +35,16 @@ class GemmTuner:
shutil.copy2(self.backup_path, self.config_path)
def generate_config(self, act_parallel, row_block_size, col_block_size, parallel_size):
"""Generate new configuration file"""
"""Generate new configuration file with simplified format"""
content = ""
# ACT_PARALLEL definition
# Simplified configuration format
if act_parallel:
content += "#define ACT_PARALLEL\n"
else:
content += "// #define ACT_PARALLEL\n"
# Detect architecture branches in original config file
with open(self.backup_path, 'r') as f:
original = f.read()
has_avx = "__AVX__" in original or "__AVX2__" in original
has_arm = "__ARM_NEON" in original
# If architecture detection exists, generate corresponding branches
if has_avx and has_arm:
# Multi-architecture configuration
content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
content += "#if defined(ACT_PARALLEL)\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#else\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#endif\n"
content += "#elif defined(__ARM_NEON)\n"
content += "#if defined(ACT_PARALLEL)\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#else\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#endif\n"
content += "#endif\n"
elif has_avx:
# AVX architecture only
content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
content += "#if defined(ACT_PARALLEL)\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#else\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#endif\n"
content += "#endif\n"
elif has_arm:
# ARM architecture only
content += "#if defined(__ARM_NEON)\n"
content += "#if defined(ACT_PARALLEL)\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#else\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#endif\n"
content += "#endif\n"
else:
# No architecture detection, define directly
content += "#if defined(ACT_PARALLEL)\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#else\n"
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
content += f" #define PARALLEL_SIZE {parallel_size}\n"
content += "#endif\n"
content += "\n"
content += f"#define ROW_BLOCK_SIZE {row_block_size}\n"
content += f"#define COL_BLOCK_SIZE {col_block_size}\n"
content += f"#define PARALLEL_SIZE {parallel_size}\n"
with open(self.config_path, 'w') as f:
f.write(content)
@@ -259,9 +190,12 @@ class GemmTuner:
# Save results
if output_csv is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_path = f"stats/tuning_results_{timestamp}.csv"
csv_path = f"../stats/tuning_results_{timestamp}.csv"
else:
csv_path = output_csv
# Ensure stats directory exists
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
self.save_results(csv_path)
# Find best configuration
@@ -278,8 +212,18 @@ class GemmTuner:
print(f"PP128 Throughput: {best['pp_throughput']:.2f} ± {best['pp_std_dev']:.2f} t/s")
print(f"{'='*80}\n")
# Show the configuration that will be written
print("Configuration to be written to gemm-config.h:")
print("-" * 80)
if best['act_parallel']:
print("#define ACT_PARALLEL")
print(f"#define ROW_BLOCK_SIZE {best['row_block_size']}")
print(f"#define COL_BLOCK_SIZE {best['col_block_size']}")
print(f"#define PARALLEL_SIZE {best['parallel_size']}")
print("-" * 80)
# Apply best configuration
apply = input("Do you want to apply this configuration? (y/n): ").strip().lower()
apply = input("\nDo you want to apply this configuration to gemm-config.h? (y/n): ").strip().lower()
if apply == 'y':
self.generate_config(
best['act_parallel'],
@@ -288,17 +232,30 @@ class GemmTuner:
best['parallel_size']
)
self.rebuild_project()
print("✅ Best configuration applied!")
print("✅ Best configuration applied and project rebuilt!")
else:
self.restore_config()
print("✅ Original configuration restored")
# Clean up backup file
if self.backup_path.exists():
self.backup_path.unlink()
print(f"🗑️ Removed backup file: {self.backup_path}")
except KeyboardInterrupt:
print("\n⚠️ Tuning interrupted by user")
self.restore_config()
# Clean up backup file
if self.backup_path.exists():
self.backup_path.unlink()
print(f"🗑️ Removed backup file: {self.backup_path}")
except Exception as e:
print(f"\n❌ Error during tuning: {e}")
self.restore_config()
# Clean up backup file
if self.backup_path.exists():
self.backup_path.unlink()
print(f"🗑️ Removed backup file: {self.backup_path}")
raise
@@ -308,9 +265,9 @@ def generate_configurations():
act_parallel_options = [True]
row_sizes = [2, 4, 8, 16, 32]
col_sizes = [32, 64, 128, 256, 512, 1024]
parallelism_degree = [2, 4, 8]
row_sizes = [2, 4, 8]#[2, 4, 8, 16, 32]
col_sizes = [32, 64]#[32, 64, 128, 256, 512, 1024]
parallelism_degree = [4]
for act_parallel in act_parallel_options:
for row in row_sizes: