diff --git a/utils/build_test_gemm_kernel.sh b/utils/build_test_gemm_kernel.sh
deleted file mode 100755
index bc45942..0000000
--- a/utils/build_test_gemm_kernel.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-# Build script for standalone GEMM kernel benchmark
-
-set -e
-
-echo "Building GEMM kernel benchmark..."
-
-# Compiler settings
-CXX=${CXX:-g++}
-BUILD_DIR="../build"
-SRC_DIR="../src"
-
-# Create build directory if it doesn't exist
-mkdir -p ${BUILD_DIR}
-
-# Compiler flags
-CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp"
-CXXFLAGS+=" -I.. -I../include"
-CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/include"
-CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/src"
-CXXFLAGS+=" -I../3rdparty/llama.cpp/include"
-CXXFLAGS+=" -DNDEBUG -ffast-math"
-
-# Link flags
-LDFLAGS="-lm -lpthread"
-
-# Link with pre-built libraries
-GGML_LIB_DIR="../build/3rdparty/llama.cpp/ggml/src"
-GGML_SO="${GGML_LIB_DIR}/libggml.so"
-
-if [ ! -f "${GGML_SO}" ]; then
-    echo "⚠️  Warning: Cannot find libggml.so"
-    echo "Please build the project first with: cmake --build build"
-    exit 1
-fi
-
-LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,\$ORIGIN/../../${GGML_LIB_DIR}"
-echo "Linking with libggml.so"
-
-# Source files
-SOURCES="./test_gemm_kernel.cpp"
-
-# Output binary
-OUTPUT="${BUILD_DIR}/test_gemm_kernel"
-
-echo "Compiler: ${CXX}"
-echo "Flags: ${CXXFLAGS}"
-echo "Sources: ${SOURCES}"
-echo ""
-
-# Build
-${CXX} ${CXXFLAGS} ${SOURCES} -o ${OUTPUT} ${LDFLAGS}
-
-if [ $? -eq 0 ]; then
-    echo ""
-    echo "✅ Build successful!"
-    echo "Output: ${OUTPUT}"
-    echo ""
-    echo "Usage examples:"
-    echo "  # Default test (n=2048, nr=32, nc=128, 1000 iterations)"
-    echo "  ${OUTPUT}"
-    echo ""
-    echo "  # Custom matrix sizes"
-    echo "  ${OUTPUT} -n 4096 -r 64 -c 256"
-    echo ""
-    echo "  # Quick test (fewer iterations)"
-    echo "  ${OUTPUT} -i 100 -w 5"
-    echo ""
-    echo "  # Large-scale test"
-    echo "  ${OUTPUT} -n 3200 -r 128 -c 512 -i 500"
-    echo ""
-else
-    echo ""
-    echo "❌ Build failed!"
-    exit 1
-fi
diff --git a/utils/kernel_tuning.py b/utils/kernel_tuning.py
deleted file mode 100644
index e69de29..0000000
diff --git a/utils/test_gemm_kernel.cpp b/utils/test_gemm_kernel.sh
old mode 100644
new mode 100755
similarity index 51%
rename from utils/test_gemm_kernel.cpp
rename to utils/test_gemm_kernel.sh
index 36964ce..ae72c72
--- a/utils/test_gemm_kernel.cpp
+++ b/utils/test_gemm_kernel.sh
@@ -1,3 +1,110 @@
+#!/bin/bash
+# Unified GEMM kernel benchmark script
+# Builds, tests, and benchmarks the GEMM kernel with configurable output
+
+set -e
+
+# Default values
+BUILD_DIR="../build"
+ITERATIONS=1000
+OUTPUT_CSV=""
+SKIP_BUILD=false
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Print the help text (options and examples) to stdout; $0 expands to the name the script was invoked as
+print_usage() {
+    cat << EOF
+Usage: $0 [options]
+
+Options:
+  -o, --output <path>     Output CSV file path (default: ../stats/gemm_kernel_test_noparal.csv)
+  -i, --iterations <num>  Number of iterations per test (default: 1000)
+  -s, --skip-build        Skip building the benchmark binary
+  -h, --help              Show this help message
+
+Examples:
+  # Run with default settings
+  $0
+
+  # Specify custom output file
+  $0 -o /path/to/my_results.csv
+
+  # Quick test with fewer iterations
+  $0 -i 100 -o quick_test.csv
+
+  # Skip build if already compiled
+  $0 -s -o results.csv
+EOF
+}
+
+# Parse command line arguments; an option that needs a value fails loudly when the value is missing
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -o|--output|-i|--iterations)
+            [[ $# -ge 2 ]] || { echo "Error: option $1 requires a value" >&2; print_usage >&2; exit 1; }
+            case "$1" in
+                -o|--output) OUTPUT_CSV="$2" ;;
+                *) ITERATIONS="$2" ;;
+            esac
+            shift 2
+            ;;
+        -s|--skip-build)
+            SKIP_BUILD=true
+            shift
+            ;;
+        -h|--help)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# Fall back to the stats directory next to this script when -o was not given
+if [[ -z "$OUTPUT_CSV" ]]; then
+    OUTPUT_CSV="${SCRIPT_DIR}/../stats/gemm_kernel_test_noparal.csv"
+fi
+
+# The directory must exist before it can be resolved with cd/pwd below
+mkdir -p "$(dirname "$OUTPUT_CSV")"
+
+# Normalize to an absolute path: relative paths are anchored to the caller's
+# current working directory, absolute paths are kept exactly as given
+case "$OUTPUT_CSV" in
+    /*) ;;
+    *)
+        OUTPUT_CSV="$(cd "$(dirname "$OUTPUT_CSV")" && pwd)/$(basename "$OUTPUT_CSV")"
+        ;;
+esac
+
+echo "=========================================="
+echo "GEMM Kernel Benchmark Suite"
+echo "=========================================="
+echo "Configuration:"
+echo "  Iterations: $ITERATIONS"
+echo "  Output CSV: $OUTPUT_CSV"
+echo "  Skip build: $SKIP_BUILD"
+echo "=========================================="
+echo ""
+
+# Build the benchmark binary
+if [ "$SKIP_BUILD" = false ]; then
+    echo "Step 1: Building GEMM kernel benchmark..."
+    echo "------------------------------------------"
+    
+    CXX=${CXX:-g++}
+    
+    # Create build directory if it doesn't exist
+    mkdir -p "${SCRIPT_DIR}/${BUILD_DIR}"
+    
+    # Create temporary C++ source file
+    TEMP_CPP="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel_temp.cpp"
+    
+    cat > "${TEMP_CPP}" << 'EOF'
 /**
  * Standalone benchmark for ggml_gemm_i2_i8_s kernel
  * 
@@ -131,13 +238,20 @@ void run_benchmark(const BenchmarkConfig& config) {
     printf("Allocating matrices...\n");
     
     // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes
-    uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4);
+    // Align to 64 bytes for AVX-512, which is backward compatible with AVX2 (32 bytes)
+    size_t x_size = config.nc * config.n / 4;
+    size_t x_size_aligned = ((x_size + 63) / 64) * 64;
+    uint8_t* X = (uint8_t*)aligned_alloc(64, x_size_aligned);
     
     // Y matrix (i8 format): nr x n
-    int8_t* Y = (int8_t*)malloc(config.nr * config.n);
+    size_t y_size = config.nr * config.n;
+    size_t y_size_aligned = ((y_size + 63) / 64) * 64;
+    int8_t* Y = (int8_t*)aligned_alloc(64, y_size_aligned);
     
     // Result matrix (float32): nr x nc
-    float* S = (float*)malloc(config.nr * config.nc * sizeof(float));
+    size_t s_size = config.nr * config.nc * sizeof(float);
+    size_t s_size_aligned = ((s_size + 63) / 64) * 64;
+    float* S = (float*)aligned_alloc(64, s_size_aligned);
     
     if (!X || !Y || !S) {
         fprintf(stderr, "Failed to allocate memory\n");
@@ -272,3 +386,188 @@ int main(int argc, char** argv) {
     
     return 0;
 }
+EOF
+    
+    # Compiler flags, kept in an array so include paths containing spaces stay single words
+    CXXFLAGS=(-O3 -march=native -mtune=native -std=c++17 -fopenmp)
+    CXXFLAGS+=("-I${SCRIPT_DIR}/.." "-I${SCRIPT_DIR}/../include")
+    CXXFLAGS+=("-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/include")
+    CXXFLAGS+=("-I${SCRIPT_DIR}/../3rdparty/llama.cpp/ggml/src")
+    CXXFLAGS+=("-I${SCRIPT_DIR}/../3rdparty/llama.cpp/include")
+    CXXFLAGS+=(-DNDEBUG -ffast-math)
+    
+    # Link flags
+    LDFLAGS=(-lm -lpthread)
+    
+    # Link with pre-built libraries
+    GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
+    GGML_SO="${GGML_LIB_DIR}/libggml.so"
+    
+    if [ ! -f "${GGML_SO}" ]; then
+        echo "❌ Error: Cannot find libggml.so at ${GGML_SO}"
+        echo "Please build the project first with: cmake --build build"
+        rm -f "${TEMP_CPP}"
+        exit 1
+    fi
+    
+    LDFLAGS+=("-L${GGML_LIB_DIR}" -lggml "-Wl,-rpath,${GGML_LIB_DIR}")
+    
+    # Output binary
+    BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
+    
+    echo "Compiler: ${CXX}"
+    echo "Building from embedded source..."
+    echo ""
+    
+    # Build. The compiler runs as the `if` condition because under `set -e` a bare
+    # failing command would abort the script before the cleanup/error branch could run.
+    # ${CXX} is left unquoted on purpose so a value like "ccache g++" still splits into words.
+    if ${CXX} "${CXXFLAGS[@]}" "${TEMP_CPP}" -o "${BENCHMARK_BIN}" "${LDFLAGS[@]}"; then
+        echo "✅ Build successful!"
+        rm -f "${TEMP_CPP}"
+        echo ""
+    else
+        echo "❌ Build failed!"
+        rm -f "${TEMP_CPP}"
+        exit 1
+    fi
+else
+    echo "Step 1: Skipping build (using existing binary)"
+    echo "------------------------------------------"
+    BENCHMARK_BIN="${SCRIPT_DIR}/${BUILD_DIR}/test_gemm_kernel"
+    
+    if [ ! -f "${BENCHMARK_BIN}" ]; then
+        echo "❌ Error: Benchmark binary not found at ${BENCHMARK_BIN}"
+        echo "Please run without -s to build it first."
+        exit 1
+    fi
+    echo "✅ Found existing binary"
+    echo ""
+fi
+
+# Prepend the GGML library dir; ':' is added only when a value already exists (an empty entry means the current directory)
+GGML_LIB_DIR="${SCRIPT_DIR}/../build/3rdparty/llama.cpp/ggml/src"
+export LD_LIBRARY_PATH="${GGML_LIB_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+echo "Step 2: Running benchmark tests"
+echo "------------------------------------------"
+echo "Library path: ${GGML_LIB_DIR}"
+echo ""
+
+# Write CSV header
+echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$OUTPUT_CSV"
+echo "Results will be saved to: $OUTPUT_CSV"
+echo ""
+
+# Parse one benchmark report and append the matching row to $OUTPUT_CSV
+extract_and_save() {
+    local label="$1"
+    local report="$2"
+    
+    # Each metric is a whitespace-separated field of the report line carrying its label
+    local dim=$(awk '/Embedding dimension/ {print $5}' <<< "$report")
+    local rows=$(awk '/Matrix Y rows/ {print $6}' <<< "$report")
+    local cols=$(awk '/Matrix X columns/ {print $6}' <<< "$report")
+    local t_avg=$(awk '/Average time/ {print $4}' <<< "$report")
+    local t_min=$(awk '/Min time/ {print $4}' <<< "$report")
+    local t_max=$(awk '/Max time/ {print $4}' <<< "$report")
+    local gflops=$(awk '/GFLOPS/ {print $3}' <<< "$report")
+    local throughput=$(awk '/Throughput/ {print $3}' <<< "$report")
+    
+    # Missing timings: warn and record a placeholder row instead of a mean±std
+    if [[ -z "$t_avg" || -z "$t_min" || -z "$t_max" ]]; then
+        echo "Warning: Failed to extract timing data for ${label}"
+        echo "${label},${dim},${rows},${cols},N/A,N/A,N/A" >> "$OUTPUT_CSV"
+        return
+    fi
+    
+    # Spread is approximated as a quarter of the observed min-max range,
+    # printed with four decimals
+    local t_std=$(awk -v lo="$t_min" -v hi="$t_max" 'BEGIN {printf "%.4f", (hi - lo) / 4}')
+    
+    # The time column carries mean±std in a single field
+    local row="${label},${dim},${rows},${cols},${t_avg}±${t_std},${gflops},${throughput}"
+    
+    # Append to CSV
+    echo "$row" >> "$OUTPUT_CSV"
+}
+
+# Run one benchmark case: print its report, then record it in the CSV.
+# Arguments: $1 - CSV test name; remaining arguments - shape options for the binary.
+# The exit status is checked by hand so the captured report is still shown on failure
+# (a bare assignment would make `set -e` abort with the diagnostics swallowed).
+run_case() {
+    local case_name="$1" report status=0
+    shift
+    report=$("$BENCHMARK_BIN" "$@" -i "$ITERATIONS" 2>&1) || status=$?
+    echo "$report"
+    if [ "$status" -ne 0 ]; then
+        echo "Error: benchmark exited with status ${status} for ${case_name}" >&2
+        exit "$status"
+    fi
+    extract_and_save "$case_name" "$report"
+}
+
+# Run benchmark tests
+echo "=========================================="
+echo "BitNet-2B Typical Shapes Performance Test"
+echo "=========================================="
+echo ""
+
+echo "Test 1: Single Token Generation (Attention QKV projection)"
+echo "  Scenario: Generating 1 token at a time"
+echo "  Shape: n=2048, r=1, c=2048"
+run_case "single_token_gen" -n 2048 -r 1 -c 2048
+echo ""
+
+echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)"
+echo "  Scenario: Processing prompt with 128 tokens, batch size 1"
+echo "  Shape: n=2048, r=128, c=2048"
+run_case "small_batch_prompt" -n 2048 -r 128 -c 2048
+echo ""
+
+echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)"
+echo "  Scenario: Processing prompt with 256 tokens or batch of 256"
+echo "  Shape: n=2048, r=256, c=2048"
+run_case "medium_batch_prompt" -n 2048 -r 256 -c 2048
+echo ""
+
+echo "Test 4: Large Batch Processing (Attention QKV projection)"
+echo "  Scenario: Processing 512 tokens or batch of 512"
+echo "  Shape: n=2048, r=512, c=2048"
+run_case "large_batch_prompt" -n 2048 -r 512 -c 2048
+echo ""
+
+echo "Test 5: FFN Up-projection (Small batch)"
+echo "  Scenario: Feed-forward network expansion, 128 tokens"
+echo "  Shape: n=2048, r=128, c=8192"
+run_case "ffn_up_projection" -n 2048 -r 128 -c 8192
+echo ""
+
+echo "Test 6: FFN Down-projection (Small batch)"
+echo "  Scenario: Feed-forward network reduction, 128 tokens"
+echo "  Shape: n=8192, r=128, c=2048"
+run_case "ffn_down_projection" -n 8192 -r 128 -c 2048
+echo ""
+
+echo "Test 7: Long Context Processing"
+echo "  Scenario: Processing very long context (2048 tokens)"
+echo "  Shape: n=2048, r=2048, c=2048"
+run_case "long_context" -n 2048 -r 2048 -c 2048
+echo ""
+
+echo "Test 8: Batched Token Generation"
+echo "  Scenario: Generating tokens for 32 sequences simultaneously"
+echo "  Shape: n=2048, r=32, c=2048"
+run_case "batched_token_gen" -n 2048 -r 32 -c 2048
+echo ""
+
+echo "=========================================="
+echo "All tests completed successfully!"
+echo "=========================================="
+echo "Results saved to: $OUTPUT_CSV"
+echo ""
+echo "Summary:"
+wc -l "$OUTPUT_CSV" | awk '{print "  Total records:", $1 - 1}'
+echo "  Output file: $OUTPUT_CSV"
+echo "=========================================="
diff --git a/utils/test_parallel_strategy.sh b/utils/test_parallel_strategy.sh
deleted file mode 100755
index 44da140..0000000
--- a/utils/test_parallel_strategy.sh
+++ /dev/null
@@ -1,277 +0,0 @@
-#!/bin/bash
-
-# Script: Test different GEMM parallel strategy performance
-# Strategies: weight-parallel and no-parallel
-# Thread counts: 1,2,4,8,12,16
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-GEMM_CONFIG="$PROJECT_ROOT/include/gemm-config.h"
-GEMM_CONFIG_BACKUP="$PROJECT_ROOT/include/gemm-config.h.bak"
-BUILD_DIR="$PROJECT_ROOT/build"
-STATS_DIR="$PROJECT_ROOT/stats"
-CSV_FILE="$STATS_DIR/test_parallel_strategy_benchmark.csv"
-MODEL_PATH="$PROJECT_ROOT/models/BitNet-b1.58-2B-4T/ggml-model-original.gguf"
-BENCHMARK_CMD="./build/bin/llama-bench"
-THREADS_LIST="1 2 4 8 12 16"
-
-# Color output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-
-log_info() {
-    echo -e "${GREEN}[INFO]${NC} $1"
-}
-
-log_warn() {
-    echo -e "${YELLOW}[WARN]${NC} $1"
-}
-
-log_error() {
-    echo -e "${RED}[ERROR]${NC} $1"
-}
-
-# Check prerequisites
-check_prerequisites() {
-    log_info "Checking prerequisites..."
-    
-    if [ ! -f "$GEMM_CONFIG" ]; then
-        log_error "gemm-config.h not found: $GEMM_CONFIG"
-        exit 1
-    fi
-    
-    if [ ! -f "$MODEL_PATH" ]; then
-        log_error "Model file not found: $MODEL_PATH"
-        exit 1
-    fi
-    
-    if [ ! -d "$BUILD_DIR" ]; then
-        log_error "Build directory not found: $BUILD_DIR"
-        exit 1
-    fi
-    
-    if [ ! -f "$BUILD_DIR/bin/llama-bench" ]; then
-        log_warn "llama-bench executable not found, building..."
-        build_project
-    fi
-    
-    if [ ! -d "$STATS_DIR" ]; then
-        log_info "Creating stats directory..."
-        mkdir -p "$STATS_DIR"
-    fi
-    
-    log_info "Prerequisites check completed"
-}
-
-# Backup original config file
-backup_config() {
-    log_info "Backing up gemm-config.h..."
-    cp "$GEMM_CONFIG" "$GEMM_CONFIG_BACKUP"
-    log_info "Backup completed: $GEMM_CONFIG_BACKUP"
-}
-
-# Restore original config file
-restore_config() {
-    if [ -f "$GEMM_CONFIG_BACKUP" ]; then
-        log_info "Restoring original gemm-config.h..."
-        cp "$GEMM_CONFIG_BACKUP" "$GEMM_CONFIG"
-        rm "$GEMM_CONFIG_BACKUP"
-        log_info "Restore completed"
-    else
-        log_warn "Backup file not found, skipping restore"
-    fi
-}
-
-# Set activation-parallel configuration (keep original ACT_PARALLEL)
-set_activation_parallel() {
-    log_info "Configuration: activation-parallel (keeping #define ACT_PARALLEL)"
-    log_info "Configuration completed"
-}
-
-# Set weight-parallel configuration (remove ACT_PARALLEL)
-set_weight_parallel() {
-    log_info "Configuration: weight-parallel (removing #define ACT_PARALLEL)"
-    
-    # Remove ACT_PARALLEL definition
-    sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
-    
-    # Verify modification
-    if grep -q "^#define ACT_PARALLEL" "$GEMM_CONFIG"; then
-        log_error "Failed to remove ACT_PARALLEL"
-        exit 1
-    fi
-    log_info "Configuration completed"
-}
-
-# Set no-parallel configuration (remove ACT_PARALLEL + modify SIZE to 1)
-set_no_parallel() {
-    log_info "Configuration: no-parallel (removing #define ACT_PARALLEL + modifying SIZE to 1)"
-    
-    # Remove ACT_PARALLEL definition
-    sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
-    
-    # Modify all ROW_BLOCK_SIZE and COL_BLOCK_SIZE to 1
-    sed -i 's/#define ROW_BLOCK_SIZE [0-9]\+/#define ROW_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
-    sed -i 's/#define COL_BLOCK_SIZE [0-9]\+/#define COL_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
-    
-    log_info "Configuration completed"
-}
-
-# Build project
-build_project() {
-    log_info "Building project..."
-    cd "$PROJECT_ROOT"
-    
-    if [ ! -f "$BUILD_DIR/Makefile" ]; then
-        log_info "First build, running cmake..."
-        cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1
-    fi
-    
-    cd "$BUILD_DIR"
-    make -j$(nproc) llama-bench > /dev/null 2>&1
-    
-    if [ ! -f "./bin/llama-bench" ]; then
-        log_error "Build failed"
-        exit 1
-    fi
-    
-    log_info "Build completed"
-    cd "$PROJECT_ROOT"
-}
-
-# Run benchmark test
-run_benchmark() {
-    local strategy=$1
-    local threads=$2
-    
-    cd "$PROJECT_ROOT"
-    
-    # Run llama-bench
-    local output=$($BENCHMARK_CMD -m "$MODEL_PATH" -p 128 -n 0 -t "$threads" -ngl 0 2>&1)
-    
-    # Extract line containing "pp128"
-    local line=$(echo "$output" | grep "pp128" | tail -1)
-    
-    if [ -z "$line" ]; then
-        return 1
-    fi
-    
-    echo "$line"
-}
-
-# Extract throughput value from benchmark output
-extract_throughput() {
-    local line=$1
-    
-    # Remove any leading/trailing whitespace and log messages
-    # The line format is: | model | size | params | backend | threads | test | throughput |
-    # We need to extract the last field which contains the throughput in format "XXX.XX ± YY.YY"
-    local throughput=$(echo "$line" | awk -F'|' '{print $NF}' | xargs | sed 's/\[.*\]//' | xargs)
-    
-    echo "$throughput"
-}
-
-# Initialize CSV file
-init_csv() {
-    log_info "Initializing CSV file: $CSV_FILE"
-    
-    cat > "$CSV_FILE" << 'EOF'
-Strategy,Threads,Throughput
-EOF
-    
-    log_info "CSV file created"
-}
-
-# Add result to CSV
-add_to_csv() {
-    local strategy=$1
-    local threads=$2
-    local throughput=$3
-    
-    echo "$strategy,$threads,$throughput" >> "$CSV_FILE"
-}
-
-# Main function
-main() {
-    log_info "Starting GEMM parallel strategy benchmark tests"
-    log_info "================================================"
-    
-    # Check prerequisites
-    check_prerequisites
-    
-    # Backup original configuration
-    backup_config
-    
-    # Initialize CSV file
-    init_csv
-    
-    # Define strategies to test
-    local strategies=("activation-parallel" "weight-parallel" "no-parallel")
-    
-    for strategy in "${strategies[@]}"; do
-        log_info "================================================"
-        log_info "Testing strategy: $strategy"
-        log_info "================================================"
-        
-        # Restore to original configuration
-        restore_config
-        backup_config
-        
-        # Apply configuration based on strategy
-        case $strategy in
-            activation-parallel)
-                set_activation_parallel
-                ;;
-            weight-parallel)
-                set_weight_parallel
-                ;;
-            no-parallel)
-                set_no_parallel
-                ;;
-        esac
-        
-        # Rebuild project to apply new configuration
-        log_info "Rebuilding project to apply new configuration..."
-        build_project
-        
-        # Run test for each thread count
-        for threads in $THREADS_LIST; do
-            log_info ""
-            log_info "Strategy: $strategy, Threads: $threads"
-            
-            # Run test (capture only output, not log messages)
-            local result=$(run_benchmark "$strategy" "$threads")
-            local test_status=$?
-            
-            if [ $test_status -eq 0 ]; then
-                # Extract throughput value from the result line
-                local throughput=$(extract_throughput "$result")
-                log_info "Throughput: $throughput"
-                
-                # Add to CSV
-                add_to_csv "$strategy" "$threads" "$throughput"
-            else
-                log_warn "Test failed for strategy $strategy, threads $threads"
-            fi
-            
-            sleep 2  # Give system time to cool down
-        done
-    done
-    
-    # Restore original configuration
-    restore_config
-    
-    log_info "================================================"
-    log_info "Test completed!"
-    log_info "Results saved to: $CSV_FILE"
-    log_info "================================================"
-    
-    # Display CSV content
-    log_info "CSV file content:"
-    cat "$CSV_FILE"
-}
-
-# Run main function
-main "$@"
diff --git a/utils/test_typical_shapes.sh b/utils/test_typical_shapes.sh
deleted file mode 100755
index 6ad805c..0000000
--- a/utils/test_typical_shapes.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/bash
-# Test typical matrix shapes for BitNet-2B model
-# Based on BitNet-b1.58-2B-4T architecture
-
-echo "=========================================="
-echo "BitNet-2B Typical Shapes Performance Test"
-echo "=========================================="
-echo ""
-
-ITERATIONS=1000
-BENCHMARK="../build/test_gemm_kernel"
-
-# Create stats directory if not exists
-mkdir -p ../stats
-
-# Generate output CSV filename
-CSV_FILE="../stats/gemm_kernel_test_noparal.csv"
-
-# Write CSV header
-echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$CSV_FILE"
-echo "Results will be saved to: $CSV_FILE"
-echo ""
-
-# Function to extract metrics and append to CSV
-extract_and_save() {
-    local test_name="$1"
-    local output="$2"
-    
-    # Extract values using grep and awk
-    local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}')
-    local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}')
-    local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}')
-    local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}')
-    local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}')
-    local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}')
-    local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}')
-    local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}')
-    
-    # Calculate standard deviation estimate from range (assuming ~95% of data within min-max)
-    # For normal distribution, range ≈ 4*std, so std ≈ range/4
-    local std_time=$(echo "scale=4; ($max_time - $min_time) / 4" | bc)
-    
-    # Format as mean±std
-    local time_formatted="${avg_time}±${std_time}"
-    
-    # For GFLOPS and throughput, we don't have std info, so just use the value
-    # If you want to estimate std for these as well, you would need more data
-    
-    # Append to CSV
-    echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$CSV_FILE"
-}
-
-echo "Test 1: Single Token Generation (Attention QKV projection)"
-echo "  Scenario: Generating 1 token at a time"
-echo "  Shape: n=2048, r=1, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "single_token_gen" "$OUTPUT"
-echo ""
-
-echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)"
-echo "  Scenario: Processing prompt with 128 tokens, batch size 1"
-echo "  Shape: n=2048, r=128, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "small_batch_prompt" "$OUTPUT"
-echo ""
-
-echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)"
-echo "  Scenario: Processing prompt with 256 tokens or batch of 256"
-echo "  Shape: n=2048, r=256, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "medium_batch_prompt" "$OUTPUT"
-echo ""
-
-echo "Test 4: Large Batch Processing (Attention QKV projection)"
-echo "  Scenario: Processing 512 tokens or batch of 512"
-echo "  Shape: n=2048, r=512, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "large_batch_prompt" "$OUTPUT"
-echo ""
-
-echo "Test 5: FFN Up-projection (Small batch)"
-echo "  Scenario: Feed-forward network expansion, 128 tokens"
-echo "  Shape: n=2048, r=128, c=8192"
-OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "ffn_up_projection" "$OUTPUT"
-echo ""
-
-echo "Test 6: FFN Down-projection (Small batch)"
-echo "  Scenario: Feed-forward network reduction, 128 tokens"
-echo "  Shape: n=8192, r=128, c=2048"
-OUTPUT=$($BENCHMARK -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "ffn_down_projection" "$OUTPUT"
-echo ""
-
-echo "Test 7: Long Context Processing"
-echo "  Scenario: Processing very long context (2048 tokens)"
-echo "  Shape: n=2048, r=2048, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "long_context" "$OUTPUT"
-echo ""
-
-echo "Test 8: Batched Token Generation"
-echo "  Scenario: Generating tokens for 32 sequences simultaneously"
-echo "  Shape: n=2048, r=32, c=2048"
-OUTPUT=$($BENCHMARK -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1)
-echo "$OUTPUT"
-extract_and_save "batched_token_gen" "$OUTPUT"
-echo ""
-
-echo "=========================================="
-echo "All tests completed!"
-echo "Results saved to: $CSV_FILE"
-echo "=========================================="
diff --git a/utils/tune_gemm_config.py b/utils/tune_gemm_config.py
index 83b4218..e537cd8 100644
--- a/utils/tune_gemm_config.py
+++ b/utils/tune_gemm_config.py
@@ -35,85 +35,16 @@ class GemmTuner:
         shutil.copy2(self.backup_path, self.config_path)
         
     def generate_config(self, act_parallel, row_block_size, col_block_size, parallel_size):
-        """Generate new configuration file"""
+        """Generate new configuration file with simplified format"""
         content = ""
         
-        # ACT_PARALLEL definition
+        # Simplified configuration format
         if act_parallel:
             content += "#define ACT_PARALLEL\n"
-        else:
-            content += "// #define ACT_PARALLEL\n"
         
-        # Detect architecture branches in original config file
-        with open(self.backup_path, 'r') as f:
-            original = f.read()
-        
-        has_avx = "__AVX__" in original or "__AVX2__" in original
-        has_arm = "__ARM_NEON" in original
-        
-        # If architecture detection exists, generate corresponding branches
-        if has_avx and has_arm:
-            # Multi-architecture configuration
-            content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
-            content += "#if defined(ACT_PARALLEL)\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#else\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#endif\n"
-            content += "#elif defined(__ARM_NEON)\n"
-            content += "#if defined(ACT_PARALLEL)\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#else\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#endif\n"
-            content += "#endif\n"
-        elif has_avx:
-            # AVX architecture only
-            content += "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
-            content += "#if defined(ACT_PARALLEL)\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#else\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#endif\n"
-            content += "#endif\n"
-        elif has_arm:
-            # ARM architecture only
-            content += "#if defined(__ARM_NEON)\n"
-            content += "#if defined(ACT_PARALLEL)\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#else\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#endif\n"
-            content += "#endif\n"
-        else:
-            # No architecture detection, define directly
-            content += "#if defined(ACT_PARALLEL)\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#else\n"
-            content += f"    #define ROW_BLOCK_SIZE {row_block_size}\n"
-            content += f"    #define COL_BLOCK_SIZE {col_block_size}\n"
-            content += f"    #define PARALLEL_SIZE {parallel_size}\n"
-            content += "#endif\n"
-        
-        content += "\n"
+        content += f"#define ROW_BLOCK_SIZE {row_block_size}\n"
+        content += f"#define COL_BLOCK_SIZE {col_block_size}\n"
+        content += f"#define PARALLEL_SIZE {parallel_size}\n"
         
         with open(self.config_path, 'w') as f:
             f.write(content)
@@ -259,9 +190,12 @@ class GemmTuner:
             # Save results
             if output_csv is None:
                 timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-                csv_path = f"stats/tuning_results_{timestamp}.csv"
+                csv_path = f"../stats/tuning_results_{timestamp}.csv"
             else:
                 csv_path = output_csv
+            
+            # Ensure stats directory exists
+            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
             self.save_results(csv_path)
             
             # Find best configuration
@@ -278,8 +212,18 @@ class GemmTuner:
                 print(f"PP128 Throughput: {best['pp_throughput']:.2f} ± {best['pp_std_dev']:.2f} t/s")
                 print(f"{'='*80}\n")
                 
+                # Show the configuration that will be written
+                print("Configuration to be written to gemm-config.h:")
+                print("-" * 80)
+                if best['act_parallel']:
+                    print("#define ACT_PARALLEL")
+                print(f"#define ROW_BLOCK_SIZE {best['row_block_size']}")
+                print(f"#define COL_BLOCK_SIZE {best['col_block_size']}")
+                print(f"#define PARALLEL_SIZE {best['parallel_size']}")
+                print("-" * 80)
+                
                 # Apply best configuration
-                apply = input("Do you want to apply this configuration? (y/n): ").strip().lower()
+                apply = input("\nDo you want to apply this configuration to gemm-config.h? (y/n): ").strip().lower()
                 if apply == 'y':
                     self.generate_config(
                         best['act_parallel'],
@@ -288,17 +232,30 @@ class GemmTuner:
                         best['parallel_size']
                     )
                     self.rebuild_project()
-                    print("✅ Best configuration applied!")
+                    print("✅ Best configuration applied and project rebuilt!")
                 else:
                     self.restore_config()
                     print("✅ Original configuration restored")
+                
+                # Clean up backup file
+                if self.backup_path.exists():
+                    self.backup_path.unlink()
+                    print(f"🗑️  Removed backup file: {self.backup_path}")
             
         except KeyboardInterrupt:
             print("\n⚠️  Tuning interrupted by user")
             self.restore_config()
+            # Clean up backup file
+            if self.backup_path.exists():
+                self.backup_path.unlink()
+                print(f"🗑️  Removed backup file: {self.backup_path}")
         except Exception as e:
             print(f"\n❌ Error during tuning: {e}")
             self.restore_config()
+            # Clean up backup file
+            if self.backup_path.exists():
+                self.backup_path.unlink()
+                print(f"🗑️  Removed backup file: {self.backup_path}")
             raise
 
 
@@ -308,9 +265,9 @@ def generate_configurations():
     
     act_parallel_options = [True]
     
-    row_sizes = [2, 4, 8, 16, 32]
-    col_sizes = [32, 64, 128, 256, 512, 1024]
-    parallelism_degree = [2, 4, 8]
+    row_sizes = [2, 4, 8]#[2, 4, 8, 16, 32]
+    col_sizes = [32, 64]#[32, 64, 128, 256, 512, 1024]
+    parallelism_degree = [4]
     
     for act_parallel in act_parallel_options:
         for row in row_sizes: