mirror of
https://github.com/microsoft/BitNet.git
synced 2026-05-03 11:20:36 +00:00
[modify] some utils test script
This commit is contained in:
@@ -1,76 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Build script for standalone GEMM kernel benchmark
|
||||
|
||||
set -e
|
||||
|
||||
echo "Building GEMM kernel benchmark..."
|
||||
|
||||
# Compiler settings
|
||||
CXX=${CXX:-g++}
|
||||
BUILD_DIR="../build"
|
||||
SRC_DIR="../src"
|
||||
|
||||
# Create build directory if it doesn't exist
|
||||
mkdir -p ${BUILD_DIR}
|
||||
|
||||
# Compiler flags
|
||||
CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp"
|
||||
CXXFLAGS+=" -I.. -I../include"
|
||||
CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/include"
|
||||
CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/src"
|
||||
CXXFLAGS+=" -I../3rdparty/llama.cpp/include"
|
||||
CXXFLAGS+=" -DNDEBUG -ffast-math"
|
||||
|
||||
# Link flags
|
||||
LDFLAGS="-lm -lpthread"
|
||||
|
||||
# Link with pre-built libraries
|
||||
GGML_LIB_DIR="../build/3rdparty/llama.cpp/ggml/src"
|
||||
GGML_SO="${GGML_LIB_DIR}/libggml.so"
|
||||
|
||||
if [ ! -f "${GGML_SO}" ]; then
|
||||
echo "⚠️ Warning: Cannot find libggml.so"
|
||||
echo "Please build the project first with: cmake --build build"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,\$ORIGIN/../../${GGML_LIB_DIR}"
|
||||
echo "Linking with libggml.so"
|
||||
|
||||
# Source files
|
||||
SOURCES="./test_gemm_kernel.cpp"
|
||||
|
||||
# Output binary
|
||||
OUTPUT="${BUILD_DIR}/test_gemm_kernel"
|
||||
|
||||
echo "Compiler: ${CXX}"
|
||||
echo "Flags: ${CXXFLAGS}"
|
||||
echo "Sources: ${SOURCES}"
|
||||
echo ""
|
||||
|
||||
# Build
|
||||
${CXX} ${CXXFLAGS} ${SOURCES} -o ${OUTPUT} ${LDFLAGS}
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "✅ Build successful!"
|
||||
echo "Output: ${OUTPUT}"
|
||||
echo ""
|
||||
echo "Usage examples:"
|
||||
echo " # Default test (n=2048, nr=32, nc=128, 1000 iterations)"
|
||||
echo " ${OUTPUT}"
|
||||
echo ""
|
||||
echo " # Custom matrix sizes"
|
||||
echo " ${OUTPUT} -n 4096 -r 64 -c 256"
|
||||
echo ""
|
||||
echo " # Quick test (fewer iterations)"
|
||||
echo " ${OUTPUT} -i 100 -w 5"
|
||||
echo ""
|
||||
echo " # Large-scale test"
|
||||
echo " ${OUTPUT} -n 3200 -r 128 -c 512 -i 500"
|
||||
echo ""
|
||||
else
|
||||
echo ""
|
||||
echo "❌ Build failed!"
|
||||
exit 1
|
||||
fi
|
||||
Regular → Executable
+302
-3
@@ -1,3 +1,110 @@
|
||||
#!/bin/bash
|
||||
# Unified GEMM kernel benchmark script
|
||||
# Builds, tests, and benchmarks the GEMM kernel with configurable output
|
||||
|
||||
set -e
|
||||
|
||||
# Default values
|
||||
BUILD_DIR="../build"
|
||||
ITERATIONS=1000
|
||||
OUTPUT_CSV=""
|
||||
SKIP_BUILD=false
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# Print usage
|
||||
print_usage() {
|
||||
cat << EOF
|
||||
Usage: $0 [options]
|
||||
|
||||
Options:
|
||||
-o, --output <path> Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv)
|
||||
-i, --iterations <num> Number of iterations per test (default: 1000)
|
||||
-s, --skip-build Skip building the benchmark binary
|
||||
-h, --help Show this help message
|
||||
|
||||
Examples:
|
||||
# Run with default settings
|
||||
$0
|
||||
|
||||
# Specify custom output file
|
||||
$0 -o /path/to/my_results.csv
|
||||
|
||||
# Quick test with fewer iterations
|
||||
$0 -i 100 -o quick_test.csv
|
||||
|
||||
# Skip build if already compiled
|
||||
$0 -s -o results.csv
|
||||
EOF
|
||||
}
|
||||
|
||||
# Parse command line arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-o|--output)
|
||||
OUTPUT_CSV="$2"
|
||||
shift 2
|
||||
;;
|
||||
-i|--iterations)
|
||||
ITERATIONS="$2"
|
||||
shift 2
|
||||
;;
|
||||
-s|--skip-build)
|
||||
SKIP_BUILD=true
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
print_usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
print_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Set default output CSV if not specified
|
||||
if [ -z "$OUTPUT_CSV" ]; then
|
||||
OUTPUT_CSV="${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv"
|
||||
fi
|
||||
|
||||
# Create output directory first
|
||||
mkdir -p "$(dirname "$OUTPUT_CSV")"
|
||||
|
||||
# Convert to absolute path
|
||||
if [[ "$OUTPUT_CSV" = /* ]]; then
|
||||
# Already absolute path
|
||||
OUTPUT_CSV="$OUTPUT_CSV"
|
||||
else
|
||||
# Convert relative path to absolute
|
||||
OUTPUT_CSV="$(cd "$(dirname "$OUTPUT_CSV")" && pwd)/$(basename "$OUTPUT_CSV")"
|
||||
fi
|
||||
|
||||
echo "=========================================="
|
||||
echo "GEMM Kernel Benchmark Suite"
|
||||
echo "=========================================="
|
||||
echo "Configuration:"
|
||||
echo " Iterations: $ITERATIONS"
|
||||
echo " Output CSV: $OUTPUT_CSV"
|
||||
echo " Skip build: $SKIP_BUILD"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
# Build the benchmark binary
|
||||
if [ "$SKIP_BUILD" = false ]; then
|
||||
echo "Step 1: Building GEMM kernel benchmark..."
|
||||
echo "------------------------------------------"
|
||||
|
||||
CXX=${CXX:-g++}
|
||||
|
||||
# Create build directory if it doesn't exist
|
||||
mkdir -p "${SCRIPT_DIR}/${BUILD_DIR}"
|
||||
|
||||
# Create temporary C++ source file
|
||||
TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp"
|
||||
|
||||
cat > "${TEMP_CPP}" << 'EOF'
|
||||
/**
|
||||
* Standalone benchmark for ggml_gemm_i2_i8_s kernel
|
||||
*
|
||||
@@ -131,13 +238,20 @@ void run_benchmark(const BenchmarkConfig& config) {
|
||||
printf("Allocating matrices...\n");
|
||||
|
||||
// X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes
|
||||
uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4);
|
||||
// Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes)
|
||||
size_t x_size = config.nc * config.n / 4;
|
||||
size_t x_size_aligned = ((x_size + 63) / 64) * 64;
|
||||
uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned);
|
||||
|
||||
// Y matrix (i8 format): nr x n
|
||||
int8_t* Y = (int8_t*)malloc(config.nr * config.n);
|
||||
size_t y_size = config.nr * config.n;
|
||||
size_t y_size_aligned = ((y_size + 63) / 64) * 64;
|
||||
int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned);
|
||||
|
||||
// Result matrix (float32): nr x nc
|
||||
float* S = (float*)malloc(config.nr * config.nc * sizeof(float));
|
||||
size_t s_size = config.nr * config.nc * sizeof(float);
|
||||
size_t s_size_aligned = ((s_size + 63) / 64) * 64;
|
||||
float* S = (float*)aligned_alloc(64, s_size_aligned);
|
||||
|
||||
if (!X || !Y || !S) {
|
||||
fprintf(stderr, "Failed to allocate memory\n");
|
||||
@@ -272,3 +386,188 @@ int main(int argc, char** argv) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
EOF
|
||||
|
||||
# Compiler flags
|
||||
CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp"
|
||||
CXXFLAGS+=" -I${SCRIPT_DIR}/.. -I${SCRIPT_DIR}/../include"
|
||||
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include"
|
||||
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src"
|
||||
CXXFLAGS+=" -I${SCRIPT_DIR}/../3rdparty/llama.cpp/include"
|
||||
CXXFLAGS+=" -DNDEBUG -ffast-math"
|
||||
|
||||
# Link flags
|
||||
LDFLAGS="-lm -lpthread"
|
||||
|
||||
# Link with pre-built libraries
|
||||
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
|
||||
GGML_SO="${GGML_LIB_DIR}/libggml.so"
|
||||
|
||||
if [ ! -f "${GGML_SO}" ]; then
|
||||
echo "❌ Error: Cannot find libggml.so at ${GGML_SO}"
|
||||
echo "Please build the project first with: cmake --build build"
|
||||
rm -f "${TEMP_CPP}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,${GGML_LIB_DIR}"
|
||||
|
||||
# Output binary
|
||||
BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
|
||||
|
||||
echo "Compiler: ${CXX}"
|
||||
echo "Building from embedded source..."
|
||||
echo ""
|
||||
|
||||
# Build
|
||||
${CXX} ${CXXFLAGS} "${TEMP_CPP}" -o ${BENCHMARK_BIN} ${LDFLAGS}
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "✅ Build successful!"
|
||||
rm -f "${TEMP_CPP}"
|
||||
echo ""
|
||||
else
|
||||
echo "❌ Build failed!"
|
||||
rm -f "${TEMP_CPP}"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Step 1: Skipping build (using existing binary)"
|
||||
echo "------------------------------------------"
|
||||
BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
|
||||
|
||||
if [ ! -f "${BENCHMARK_BIN}" ]; then
|
||||
echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}"
|
||||
echo "Please run without -s to build it first."
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Found existing binary"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# Set LD_LIBRARY_PATH to include the GGML library directory
|
||||
GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
|
||||
export LD_LIBRARY_PATH="${GGML_LIB_DIR}:${LD_LIBRARY_PATH}"
|
||||
|
||||
echo "Step 2: Running benchmark tests"
|
||||
echo "------------------------------------------"
|
||||
echo "Library path: ${GGML_LIB_DIR}"
|
||||
echo ""
|
||||
|
||||
# Write CSV header
|
||||
echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$OUTPUT_CSV"
|
||||
echo "Results will be saved to: $OUTPUT_CSV"
|
||||
echo ""
|
||||
|
||||
# Function to extract metrics and append to CSV
|
||||
extract_and_save() {
|
||||
local test_name="$1"
|
||||
local output="$2"
|
||||
|
||||
# Extract values using grep and awk
|
||||
local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}')
|
||||
local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}')
|
||||
local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}')
|
||||
local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}')
|
||||
local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}')
|
||||
local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}')
|
||||
local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}')
|
||||
local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}')
|
||||
|
||||
# Check if values were extracted successfully
|
||||
if [ -z "$avg_time" ] || [ -z "$min_time" ] || [ -z "$max_time" ]; then
|
||||
echo "Warning: Failed to extract timing data for ${test_name}"
|
||||
echo "${test_name},${n},${nr},${nc},N/A,N/A,N/A" >> "$OUTPUT_CSV"
|
||||
return
|
||||
fi
|
||||
|
||||
# Calculate standard deviation estimate from range
|
||||
# Using awk with proper variable passing
|
||||
local std_time=$(awk -v min="$min_time" -v max="$max_time" 'BEGIN {printf "%.4f", (max - min) / 4}')
|
||||
|
||||
# Format as mean±std
|
||||
local time_formatted="${avg_time}±${std_time}"
|
||||
|
||||
# Append to CSV
|
||||
echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$OUTPUT_CSV"
|
||||
}
|
||||
|
||||
# Run benchmark tests
|
||||
echo "=========================================="
|
||||
echo "BitNet-2B Typical Shapes Performance Test"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
echo "Test 1: Single Token Generation (Attention QKV projection)"
|
||||
echo " Scenario: Generating 1 token at a time"
|
||||
echo " Shape: n=2048, r=1, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "single_token_gen" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing prompt with 128 tokens, batch size 1"
|
||||
echo " Shape: n=2048, r=128, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "small_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing prompt with 256 tokens or batch of 256"
|
||||
echo " Shape: n=2048, r=256, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "medium_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 4: Large Batch Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing 512 tokens or batch of 512"
|
||||
echo " Shape: n=2048, r=512, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "large_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 5: FFN Up-projection (Small batch)"
|
||||
echo " Scenario: Feed-forward network expansion, 128 tokens"
|
||||
echo " Shape: n=2048, r=128, c=8192"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "ffn_up_projection" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 6: FFN Down-projection (Small batch)"
|
||||
echo " Scenario: Feed-forward network reduction, 128 tokens"
|
||||
echo " Shape: n=8192, r=128, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "ffn_down_projection" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 7: Long Context Processing"
|
||||
echo " Scenario: Processing very long context (2048 tokens)"
|
||||
echo " Shape: n=2048, r=2048, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "long_context" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 8: Batched Token Generation"
|
||||
echo " Scenario: Generating tokens for 32 sequences simultaneously"
|
||||
echo " Shape: n=2048, r=32, c=2048"
|
||||
OUTPUT=$($BENCHMARK_BIN -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "batched_token_gen" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "All tests completed successfully!"
|
||||
echo "=========================================="
|
||||
echo "Results saved to: $OUTPUT_CSV"
|
||||
echo ""
|
||||
echo "Summary:"
|
||||
wc -l "$OUTPUT_CSV" | awk '{print " Total records:", $1 - 1}'
|
||||
echo " Output file: $OUTPUT_CSV"
|
||||
echo "=========================================="
|
||||
@@ -1,277 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script: Test different GEMM parallel strategy performance
|
||||
# Strategies: weight-parallel and no-parallel
|
||||
# Thread counts: 1,2,4,8,12,16
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
GEMM_CONFIG="$PROJECT_ROOT/include/gemm-config.h"
|
||||
GEMM_CONFIG_BACKUP="$PROJECT_ROOT/include/gemm-config.h.bak"
|
||||
BUILD_DIR="$PROJECT_ROOT/build"
|
||||
STATS_DIR="$PROJECT_ROOT/stats"
|
||||
CSV_FILE="$STATS_DIR/test_parallel_strategy_benchmark.csv"
|
||||
MODEL_PATH="$PROJECT_ROOT/models/BitNet-b1.58-2B-4T/ggml-model-original.gguf"
|
||||
BENCHMARK_CMD="./build/bin/llama-bench"
|
||||
THREADS_LIST="1 2 4 8 12 16"
|
||||
|
||||
# Color output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
log_info() {
|
||||
echo -e "${GREEN}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# Check prerequisites
|
||||
check_prerequisites() {
|
||||
log_info "Checking prerequisites..."
|
||||
|
||||
if [ ! -f "$GEMM_CONFIG" ]; then
|
||||
log_error "gemm-config.h not found: $GEMM_CONFIG"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f "$MODEL_PATH" ]; then
|
||||
log_error "Model file not found: $MODEL_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -d "$BUILD_DIR" ]; then
|
||||
log_error "Build directory not found: $BUILD_DIR"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f "$BUILD_DIR/bin/llama-bench" ]; then
|
||||
log_warn "llama-bench executable not found, building..."
|
||||
build_project
|
||||
fi
|
||||
|
||||
if [ ! -d "$STATS_DIR" ]; then
|
||||
log_info "Creating stats directory..."
|
||||
mkdir -p "$STATS_DIR"
|
||||
fi
|
||||
|
||||
log_info "Prerequisites check completed"
|
||||
}
|
||||
|
||||
# Backup original config file
|
||||
backup_config() {
|
||||
log_info "Backing up gemm-config.h..."
|
||||
cp "$GEMM_CONFIG" "$GEMM_CONFIG_BACKUP"
|
||||
log_info "Backup completed: $GEMM_CONFIG_BACKUP"
|
||||
}
|
||||
|
||||
# Restore original config file
|
||||
restore_config() {
|
||||
if [ -f "$GEMM_CONFIG_BACKUP" ]; then
|
||||
log_info "Restoring original gemm-config.h..."
|
||||
cp "$GEMM_CONFIG_BACKUP" "$GEMM_CONFIG"
|
||||
rm "$GEMM_CONFIG_BACKUP"
|
||||
log_info "Restore completed"
|
||||
else
|
||||
log_warn "Backup file not found, skipping restore"
|
||||
fi
|
||||
}
|
||||
|
||||
# Set activation-parallel configuration (keep original ACT_PARALLEL)
|
||||
set_activation_parallel() {
|
||||
log_info "Configuration: activation-parallel (keeping #define ACT_PARALLEL)"
|
||||
log_info "Configuration completed"
|
||||
}
|
||||
|
||||
# Set weight-parallel configuration (remove ACT_PARALLEL)
|
||||
set_weight_parallel() {
|
||||
log_info "Configuration: weight-parallel (removing #define ACT_PARALLEL)"
|
||||
|
||||
# Remove ACT_PARALLEL definition
|
||||
sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
|
||||
|
||||
# Verify modification
|
||||
if grep -q "^#define ACT_PARALLEL" "$GEMM_CONFIG"; then
|
||||
log_error "Failed to remove ACT_PARALLEL"
|
||||
exit 1
|
||||
fi
|
||||
log_info "Configuration completed"
|
||||
}
|
||||
|
||||
# Set no-parallel configuration (remove ACT_PARALLEL + modify SIZE to 1)
|
||||
set_no_parallel() {
|
||||
log_info "Configuration: no-parallel (removing #define ACT_PARALLEL + modifying SIZE to 1)"
|
||||
|
||||
# Remove ACT_PARALLEL definition
|
||||
sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
|
||||
|
||||
# Modify all ROW_BLOCK_SIZE and COL_BLOCK_SIZE to 1
|
||||
sed -i 's/#define ROW_BLOCK_SIZE [0-9]\+/#define ROW_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
|
||||
sed -i 's/#define COL_BLOCK_SIZE [0-9]\+/#define COL_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
|
||||
|
||||
log_info "Configuration completed"
|
||||
}
|
||||
|
||||
# Build project
|
||||
build_project() {
|
||||
log_info "Building project..."
|
||||
cd "$PROJECT_ROOT"
|
||||
|
||||
if [ ! -f "$BUILD_DIR/Makefile" ]; then
|
||||
log_info "First build, running cmake..."
|
||||
cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1
|
||||
fi
|
||||
|
||||
cd "$BUILD_DIR"
|
||||
make -j$(nproc) llama-bench > /dev/null 2>&1
|
||||
|
||||
if [ ! -f "./bin/llama-bench" ]; then
|
||||
log_error "Build failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_info "Build completed"
|
||||
cd "$PROJECT_ROOT"
|
||||
}
|
||||
|
||||
# Run benchmark test
|
||||
run_benchmark() {
|
||||
local strategy=$1
|
||||
local threads=$2
|
||||
|
||||
cd "$PROJECT_ROOT"
|
||||
|
||||
# Run llama-bench
|
||||
local output=$($BENCHMARK_CMD -m "$MODEL_PATH" -p 128 -n 0 -t "$threads" -ngl 0 2>&1)
|
||||
|
||||
# Extract line containing "pp128"
|
||||
local line=$(echo "$output" | grep "pp128" | tail -1)
|
||||
|
||||
if [ -z "$line" ]; then
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "$line"
|
||||
}
|
||||
|
||||
# Extract throughput value from benchmark output
|
||||
extract_throughput() {
|
||||
local line=$1
|
||||
|
||||
# Remove any leading/trailing whitespace and log messages
|
||||
# The line format is: | model | size | params | backend | threads | test | throughput |
|
||||
# We need to extract the last field which contains the throughput in format "XXX.XX ± YY.YY"
|
||||
local throughput=$(echo "$line" | awk -F'|' '{print $NF}' | xargs | sed 's/\[.*\]//' | xargs)
|
||||
|
||||
echo "$throughput"
|
||||
}
|
||||
|
||||
# Initialize CSV file
|
||||
init_csv() {
|
||||
log_info "Initializing CSV file: $CSV_FILE"
|
||||
|
||||
cat > "$CSV_FILE" << 'EOF'
|
||||
Strategy,Threads,Throughput
|
||||
EOF
|
||||
|
||||
log_info "CSV file created"
|
||||
}
|
||||
|
||||
# Add result to CSV
|
||||
add_to_csv() {
|
||||
local strategy=$1
|
||||
local threads=$2
|
||||
local throughput=$3
|
||||
|
||||
echo "$strategy,$threads,$throughput" >> "$CSV_FILE"
|
||||
}
|
||||
|
||||
# Main function
|
||||
main() {
|
||||
log_info "Starting GEMM parallel strategy benchmark tests"
|
||||
log_info "================================================"
|
||||
|
||||
# Check prerequisites
|
||||
check_prerequisites
|
||||
|
||||
# Backup original configuration
|
||||
backup_config
|
||||
|
||||
# Initialize CSV file
|
||||
init_csv
|
||||
|
||||
# Define strategies to test
|
||||
local strategies=("activation-parallel" "weight-parallel" "no-parallel")
|
||||
|
||||
for strategy in "${strategies[@]}"; do
|
||||
log_info "================================================"
|
||||
log_info "Testing strategy: $strategy"
|
||||
log_info "================================================"
|
||||
|
||||
# Restore to original configuration
|
||||
restore_config
|
||||
backup_config
|
||||
|
||||
# Apply configuration based on strategy
|
||||
case $strategy in
|
||||
activation-parallel)
|
||||
set_activation_parallel
|
||||
;;
|
||||
weight-parallel)
|
||||
set_weight_parallel
|
||||
;;
|
||||
no-parallel)
|
||||
set_no_parallel
|
||||
;;
|
||||
esac
|
||||
|
||||
# Rebuild project to apply new configuration
|
||||
log_info "Rebuilding project to apply new configuration..."
|
||||
build_project
|
||||
|
||||
# Run test for each thread count
|
||||
for threads in $THREADS_LIST; do
|
||||
log_info ""
|
||||
log_info "Strategy: $strategy, Threads: $threads"
|
||||
|
||||
# Run test (capture only output, not log messages)
|
||||
local result=$(run_benchmark "$strategy" "$threads")
|
||||
local test_status=$?
|
||||
|
||||
if [ $test_status -eq 0 ]; then
|
||||
# Extract throughput value from the result line
|
||||
local throughput=$(extract_throughput "$result")
|
||||
log_info "Throughput: $throughput"
|
||||
|
||||
# Add to CSV
|
||||
add_to_csv "$strategy" "$threads" "$throughput"
|
||||
else
|
||||
log_warn "Test failed for strategy $strategy, threads $threads"
|
||||
fi
|
||||
|
||||
sleep 2 # Give system time to cool down
|
||||
done
|
||||
done
|
||||
|
||||
# Restore original configuration
|
||||
restore_config
|
||||
|
||||
log_info "================================================"
|
||||
log_info "Test completed!"
|
||||
log_info "Results saved to: $CSV_FILE"
|
||||
log_info "================================================"
|
||||
|
||||
# Display CSV content
|
||||
log_info "CSV file content:"
|
||||
cat "$CSV_FILE"
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
@@ -1,120 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Test typical matrix shapes for BitNet-2B model
|
||||
# Based on BitNet-b1.58-2B-4T architecture
|
||||
|
||||
echo "=========================================="
|
||||
echo "BitNet-2B Typical Shapes Performance Test"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
ITERATIONS=1000
|
||||
BENCHMARK="../build/test_gemm_kernel"
|
||||
|
||||
# Create stats directory if not exists
|
||||
mkdir -p ../stats
|
||||
|
||||
# Generate output CSV filename
|
||||
CSV_FILE="../stats/gemm_kernel_test_noparal.csv"
|
||||
|
||||
# Write CSV header
|
||||
echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$CSV_FILE"
|
||||
echo "Results will be saved to: $CSV_FILE"
|
||||
echo ""
|
||||
|
||||
# Function to extract metrics and append to CSV
|
||||
extract_and_save() {
|
||||
local test_name="$1"
|
||||
local output="$2"
|
||||
|
||||
# Extract values using grep and awk
|
||||
local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}')
|
||||
local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}')
|
||||
local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}')
|
||||
local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}')
|
||||
local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}')
|
||||
local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}')
|
||||
local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}')
|
||||
local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}')
|
||||
|
||||
# Calculate standard deviation estimate from range (assuming ~95% of data within min-max)
|
||||
# For normal distribution, range ≈ 4*std, so std ≈ range/4
|
||||
local std_time=$(echo "scale=4; ($max_time - $min_time) / 4" | bc)
|
||||
|
||||
# Format as mean±std
|
||||
local time_formatted="${avg_time}±${std_time}"
|
||||
|
||||
# For GFLOPS and throughput, we don't have std info, so just use the value
|
||||
# If you want to estimate std for these as well, you would need more data
|
||||
|
||||
# Append to CSV
|
||||
echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$CSV_FILE"
|
||||
}
|
||||
|
||||
echo "Test 1: Single Token Generation (Attention QKV projection)"
|
||||
echo " Scenario: Generating 1 token at a time"
|
||||
echo " Shape: n=2048, r=1, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "single_token_gen" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing prompt with 128 tokens, batch size 1"
|
||||
echo " Shape: n=2048, r=128, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "small_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing prompt with 256 tokens or batch of 256"
|
||||
echo " Shape: n=2048, r=256, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "medium_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 4: Large Batch Processing (Attention QKV projection)"
|
||||
echo " Scenario: Processing 512 tokens or batch of 512"
|
||||
echo " Shape: n=2048, r=512, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "large_batch_prompt" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 5: FFN Up-projection (Small batch)"
|
||||
echo " Scenario: Feed-forward network expansion, 128 tokens"
|
||||
echo " Shape: n=2048, r=128, c=8192"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "ffn_up_projection" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 6: FFN Down-projection (Small batch)"
|
||||
echo " Scenario: Feed-forward network reduction, 128 tokens"
|
||||
echo " Shape: n=8192, r=128, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "ffn_down_projection" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 7: Long Context Processing"
|
||||
echo " Scenario: Processing very long context (2048 tokens)"
|
||||
echo " Shape: n=2048, r=2048, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "long_context" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "Test 8: Batched Token Generation"
|
||||
echo " Scenario: Generating tokens for 32 sequences simultaneously"
|
||||
echo " Shape: n=2048, r=32, c=2048"
|
||||
OUTPUT=$($BENCHMARK -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1)
|
||||
echo "$OUTPUT"
|
||||
extract_and_save "batched_token_gen" "$OUTPUT"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "All tests completed!"
|
||||
echo "Results saved to: $CSV_FILE"
|
||||
echo "=========================================="
|
||||
+37
-80
@@ -35,85 +35,16 @@ class GemmTuner:
|
||||
shutil.copy2(self.backup_path, self.config_path)
|
||||
|
||||
def generate_config(self, act_parallel, row_block_size, col_block_size, parallel_size):
|
||||
"""Generate new configuration file"""
|
||||
"""Generate new configuration file with simplified format"""
|
||||
content = ""
|
||||
|
||||
# ACT_PARALLEL definition
|
||||
# Simplified configuration format
|
||||
if act_parallel:
|
||||
content += "#define ACT_PARALLEL\n"
|
||||
else:
|
||||
content += "// #define ACT_PARALLEL\n"
|
||||
|
||||
# Detect architecture branches in original config file
|
||||
with open(self.backup_path, 'r') as f:
|
||||
original = f.read()
|
||||
|
||||
has_avx = "__AVX__" in original or "__AVX2__" in original
|
||||
has_arm = "__ARM_NEON" in original
|
||||
|
||||
# If architecture detection exists, generate corresponding branches
|
||||
if has_avx and has_arm:
|
||||
# Multi-architecture configuration
|
||||
content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
|
||||
content += "#if defined(ACT_PARALLEL)\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#else\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#endif\n"
|
||||
content += "#elif defined(__ARM_NEON)\n"
|
||||
content += "#if defined(ACT_PARALLEL)\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#else\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#endif\n"
|
||||
content += "#endif\n"
|
||||
elif has_avx:
|
||||
# AVX architecture only
|
||||
content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
|
||||
content += "#if defined(ACT_PARALLEL)\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#else\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#endif\n"
|
||||
content += "#endif\n"
|
||||
elif has_arm:
|
||||
# ARM architecture only
|
||||
content += "#if defined(__ARM_NEON)\n"
|
||||
content += "#if defined(ACT_PARALLEL)\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#else\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#endif\n"
|
||||
content += "#endif\n"
|
||||
else:
|
||||
# No architecture detection, define directly
|
||||
content += "#if defined(ACT_PARALLEL)\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#else\n"
|
||||
content += f" #define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f" #define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f" #define PARALLEL_SIZE {parallel_size}\n"
|
||||
content += "#endif\n"
|
||||
|
||||
content += "\n"
|
||||
content += f"#define ROW_BLOCK_SIZE {row_block_size}\n"
|
||||
content += f"#define COL_BLOCK_SIZE {col_block_size}\n"
|
||||
content += f"#define PARALLEL_SIZE {parallel_size}\n"
|
||||
|
||||
with open(self.config_path, 'w') as f:
|
||||
f.write(content)
|
||||
@@ -259,9 +190,12 @@ class GemmTuner:
|
||||
# Save results
|
||||
if output_csv is None:
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
csv_path = f"stats/tuning_results_{timestamp}.csv"
|
||||
csv_path = f"../stats/tuning_results_{timestamp}.csv"
|
||||
else:
|
||||
csv_path = output_csv
|
||||
|
||||
# Ensure stats directory exists
|
||||
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
|
||||
self.save_results(csv_path)
|
||||
|
||||
# Find best configuration
|
||||
@@ -278,8 +212,18 @@ class GemmTuner:
|
||||
print(f"PP128 Throughput: {best['pp_throughput']:.2f} ± {best['pp_std_dev']:.2f} t/s")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Show the configuration that will be written
|
||||
print("Configuration to be written to gemm-config.h:")
|
||||
print("-" * 80)
|
||||
if best['act_parallel']:
|
||||
print("#define ACT_PARALLEL")
|
||||
print(f"#define ROW_BLOCK_SIZE {best['row_block_size']}")
|
||||
print(f"#define COL_BLOCK_SIZE {best['col_block_size']}")
|
||||
print(f"#define PARALLEL_SIZE {best['parallel_size']}")
|
||||
print("-" * 80)
|
||||
|
||||
# Apply best configuration
|
||||
apply = input("Do you want to apply this configuration? (y/n): ").strip().lower()
|
||||
apply = input("\nDo you want to apply this configuration to gemm-config.h? (y/n): ").strip().lower()
|
||||
if apply == 'y':
|
||||
self.generate_config(
|
||||
best['act_parallel'],
|
||||
@@ -288,17 +232,30 @@ class GemmTuner:
|
||||
best['parallel_size']
|
||||
)
|
||||
self.rebuild_project()
|
||||
print("✅ Best configuration applied!")
|
||||
print("✅ Best configuration applied and project rebuilt!")
|
||||
else:
|
||||
self.restore_config()
|
||||
print("✅ Original configuration restored")
|
||||
|
||||
# Clean up backup file
|
||||
if self.backup_path.exists():
|
||||
self.backup_path.unlink()
|
||||
print(f"🗑️ Removed backup file: {self.backup_path}")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n⚠️ Tuning interrupted by user")
|
||||
self.restore_config()
|
||||
# Clean up backup file
|
||||
if self.backup_path.exists():
|
||||
self.backup_path.unlink()
|
||||
print(f"🗑️ Removed backup file: {self.backup_path}")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error during tuning: {e}")
|
||||
self.restore_config()
|
||||
# Clean up backup file
|
||||
if self.backup_path.exists():
|
||||
self.backup_path.unlink()
|
||||
print(f"🗑️ Removed backup file: {self.backup_path}")
|
||||
raise
|
||||
|
||||
|
||||
@@ -308,9 +265,9 @@ def generate_configurations():
|
||||
|
||||
act_parallel_options = [True]
|
||||
|
||||
row_sizes = [2, 4, 8, 16, 32]
|
||||
col_sizes = [32, 64, 128, 256, 512, 1024]
|
||||
parallelism_degree = [2, 4, 8]
|
||||
row_sizes = [2, 4, 8]#[2, 4, 8, 16, 32]
|
||||
col_sizes = [32, 64]#[32, 64, 128, 256, 512, 1024]
|
||||
parallelism_degree = [4]
|
||||
|
||||
for act_parallel in act_parallel_options:
|
||||
for row in row_sizes:
|
||||
|
||||
Reference in New Issue
Block a user