From 41cc3048684923758f77a8434559f0d6c8f41e12 Mon Sep 17 00:00:00 2001
From: deva100
 <azureuser@a100dev2.vmlraidoxzcexnpbwxrwjxjste.px.internal.cloudapp.net>
Date: Tue, 23 Dec 2025 06:48:33 +0000
Subject: [PATCH] [chore] add some automation bash script for BitNet Tech
 Report

---
 demo_benchmark.sh       | 121 +++++++
 run_paper_benchmarks.sh | 720 ++++++++++++++++++++++++++++++++++++++++
 test_benchmark_setup.sh | 160 +++++++++
 3 files changed, 1001 insertions(+)
 create mode 100755 demo_benchmark.sh
 create mode 100755 run_paper_benchmarks.sh
 create mode 100755 test_benchmark_setup.sh

diff --git a/demo_benchmark.sh b/demo_benchmark.sh
new file mode 100755
index 0000000..8845a3f
--- /dev/null
+++ b/demo_benchmark.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+################################################################################
+# Quick Demo of Benchmark Automation
+# This runs a subset of benchmarks to verify the script works
+################################################################################
+
+set -euo pipefail
+# ANSI color escapes, interpreted when printed (echo -e / printf %b)
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+# Every run writes into its own timestamped directory below stats/
+STATS_DIR="stats/demo_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "${STATS_DIR}"
+
+printf '%b\n' "${BLUE}========================================${NC}"
+printf '%b\n' "${BLUE}Quick Benchmark Demo${NC}"
+printf '%b\n' "${BLUE}========================================${NC}"
+printf '\n'
+printf '%s\n' "Output directory: ${STATS_DIR}"
+printf '\n'
+
+# Test 1: Machine info (sysctl fallback where nproc is missing; sed reads all of lscpu's output, so no SIGPIPE under pipefail)
+echo -e "${GREEN}[1/3] Collecting machine info...${NC}"
+{
+    echo "=== Machine Information ==="
+    echo "Architecture: $(uname -m)"
+    echo "CPU cores: $(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 'unknown')"
+    echo "Timestamp: $(date)"
+    echo ""
+    if command -v lscpu &> /dev/null; then lscpu | sed -n '1,20p'; else echo "lscpu not available"; fi
+} | tee "${STATS_DIR}/machine_info.txt"
+echo ""
+
+# Test 2: Quick benchmark test
+echo -e "${GREEN}[2/3] Running quick benchmark (threads: 1,2,4)...${NC}"
+if [[ -f "build/bin/llama-bench" ]] && [[ -f "models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_q6_k.gguf" ]]; then
+    ./build/bin/llama-bench \
+        -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_q6_k.gguf \
+        -p 128 -n 128 -t 1,2,4 -ngl 0 \
+        2>&1 | tee "${STATS_DIR}/bench_quick.txt"
+    
+    # Parse results: with -F '|' a table row yields $6=threads, $7=test, $8="t/s ± stddev"
+    {
+        echo "# Quick Benchmark Results"
+        echo ""
+        echo "| Threads | Test | Tokens/sec |"
+        echo "|---------|------|------------|"
+        awk -F '|' '
+            /bitnet.*pp128/ || /bitnet.*tg128/ {
+                gsub(/^[[:space:]]+|[[:space:]]+$/, "", $6);
+                gsub(/^[[:space:]]+|[[:space:]]+$/, "", $7);
+                gsub(/^[[:space:]]+|[[:space:]]+$/, "", $8);
+                split($8, perf, "±");
+                gsub(/^[[:space:]]+|[[:space:]]+$/, "", perf[1]);
+                printf "| %7s | %4s | %10s |\n", $6, $7, perf[1];
+            }
+        ' "${STATS_DIR}/bench_quick.txt"
+    } > "${STATS_DIR}/bench_results.md"
+    
+    echo ""
+    echo -e "${GREEN}Results saved to: ${STATS_DIR}/bench_results.md${NC}"
+    cat "${STATS_DIR}/bench_results.md"
+else
+    echo "Skipping benchmark (model or binary not found)"
+fi
+echo ""
+
+# Test 3: Quick PPL test (one dataset only)
+echo -e "${GREEN}[3/3] Running quick PPL test (wikitext-2 only, 2 embed types)...${NC}"
+if [[ -f "build/bin/llama-perplexity" ]] && [[ -f "data/wikitext-2-raw/wiki.test.raw" ]]; then
+    {
+        echo "# Quick PPL Test"
+        echo ""
+        echo "| Embed Type | PPL |"
+        echo "|------------|-----|"
+        for embed in i2_s q6_k; do
+            model="models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_${embed}.gguf"
+            if [[ -f "$model" ]]; then
+                # Progress goes to stderr so it does not end up inside the Markdown table
+                echo "Testing: $embed..." >&2
+                output=$(./build/bin/llama-perplexity \
+                    -m "$model" \
+                    -f data/wikitext-2-raw/wiki.test.raw \
+                    -t 4 -ngl 0 2>&1 || true)
+                # POSIX awk only: 2-argument match() with RSTART/RLENGTH (the 3-argument form is gawk-specific)
+                ppl=$(echo "$output" | awk '
+                    /Final estimate/ && match($0, /PPL[ \t]*=[ \t]*[0-9]+(\.[0-9]+)?/) {
+                        val = substr($0, RSTART, RLENGTH);
+                        sub(/^PPL[ \t]*=[ \t]*/, "", val);
+                        print val;
+                        exit;
+                    }
+                ')
+                
+                if [[ -n "$ppl" ]]; then
+                    echo "| $embed | $ppl |"
+                else
+                    echo "| $embed | N/A |"
+                fi
+            fi
+        done
+    } | tee "${STATS_DIR}/ppl_quick.md"
+    
+    echo ""
+    echo -e "${GREEN}Results saved to: ${STATS_DIR}/ppl_quick.md${NC}"
+else
+    echo "Skipping PPL test (binary or dataset not found)"
+fi
+echo ""
+
+printf '%b\n' "${BLUE}========================================${NC}"
+printf '%b\n' "${GREEN}Demo completed!${NC}"
+printf '%b\n' "${BLUE}========================================${NC}"
+printf '\n'
+printf '%s\n' "All results in: ${STATS_DIR}/"
+printf '\n'
+printf '%s\n' "To run the full automation script:"
+printf '%s\n' "  ./run_paper_benchmarks.sh"
+printf '\n'
diff --git a/run_paper_benchmarks.sh b/run_paper_benchmarks.sh
new file mode 100755
index 0000000..975ddde
--- /dev/null
+++ b/run_paper_benchmarks.sh
@@ -0,0 +1,720 @@
+#!/bin/bash
+
+################################################################################
+# Paper Benchmark Automation Script
+# This script automates all experiments needed for the paper on both Intel and ARM
+################################################################################
+
+set -euo pipefail  # abort on errors, unset variables and failing pipeline stages
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Configuration: model location plus one timestamped set of report paths per run
+STATS_DIR="stats"
+MODEL_NAME="BitNet-b1.58-2B-4T"
+MODEL_DIR="models/${MODEL_NAME}"
+HF_REPO="microsoft/${MODEL_NAME}"
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")  # taken once, shared by all output file names below
+MACHINE_INFO_FILE="${STATS_DIR}/machine_info_${TIMESTAMP}.txt"
+BENCH_RESULTS_FILE="${STATS_DIR}/bench_results_${TIMESTAMP}.md"
+BENCH_RAW_FILE="${STATS_DIR}/bench_raw_${TIMESTAMP}.txt"
+PPL_RESULTS_FILE="${STATS_DIR}/ppl_results_${TIMESTAMP}.md"
+PPL_CSV_FILE="${STATS_DIR}/ppl_results_${TIMESTAMP}.csv"
+
+# Create stats directory if not exists
+mkdir -p "${STATS_DIR}"
+
+################################################################################
+# Helper Functions
+################################################################################
+
+log_info() {  # $1: message, printed to stdout behind a blue [INFO] tag
+    printf '%b\n' "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {  # $1: message, printed to stdout behind a green [SUCCESS] tag
+    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
+}
+
+log_warning() {  # $1: message, printed to stdout behind a yellow [WARNING] tag
+    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
+}
+
+log_error() {  # $1: message; written to stderr so errors stay visible when stdout is piped or redirected
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+section_header() {  # $1: title, printed in green between two rules of '=' characters
+    local rule="================================================================================"
+    printf '\n%s\n' "${rule}"
+    printf '%b\n' "${GREEN}$1${NC}"
+    printf '%s\n' "${rule}"
+}
+
+################################################################################
+# Step 1: Machine Information and Environment Setup
+################################################################################
+
+# Step 1: record hardware and toolchain details in MACHINE_INFO_FILE, then
+# install the Python requirements. Everything printed inside the braces is
+# shown on the terminal and saved to the file by tee.
+step1_machine_info() {
+    section_header "STEP 1: Machine Information and Environment Setup"
+    log_info "Collecting machine information..."
+    {
+        echo "================================"
+        echo "Machine Information"
+        echo "================================"
+        echo "Timestamp: $(date)"
+        echo ""
+        echo "--- System Architecture ---"
+        uname -a
+        echo ""
+        echo "--- CPU Information ---"
+        if command -v lscpu &> /dev/null; then
+            lscpu
+        elif [[ -f /proc/cpuinfo ]]; then
+            cat /proc/cpuinfo
+        else
+            log_warning "Could not get CPU information"
+        fi
+        echo ""
+        echo "--- CPU Cores ---"
+        NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "unknown")
+        echo "Number of CPU cores: ${NPROC}"
+        echo ""
+        echo "--- Memory Information ---"
+        if command -v free &> /dev/null; then
+            free -h
+        elif command -v vm_stat &> /dev/null; then
+            vm_stat
+        else
+            log_warning "Could not get memory information"
+        fi
+        echo ""
+        echo "--- Architecture Detection ---"
+        ARCH=$(uname -m)
+        echo "Architecture: ${ARCH}"
+        if [[ "${ARCH}" == "x86_64" ]]; then
+            echo "Platform: Intel/AMD x86_64"
+        elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then
+            echo "Platform: ARM64"
+        else
+            echo "Platform: Other (${ARCH})"
+        fi
+        echo ""
+        echo "--- Compiler Information ---"
+        if command -v clang &> /dev/null; then
+            clang --version
+        fi
+        if command -v gcc &> /dev/null; then
+            gcc --version
+        fi
+        if command -v cmake &> /dev/null; then
+            cmake --version
+        fi
+        echo ""
+        echo "--- Python Environment ---"
+        # Probe with command -v: a missing interpreter is reported, it neither aborts the run nor prints "command not found"
+        if command -v python &> /dev/null; then
+            python --version
+        elif command -v python3 &> /dev/null; then
+            python3 --version
+        else
+            log_warning "Could not find a Python interpreter"
+        fi
+        if command -v conda &> /dev/null; then
+            conda --version
+            echo "Active conda environment: ${CONDA_DEFAULT_ENV:-none}"
+        fi
+        echo ""
+    } | tee "${MACHINE_INFO_FILE}"
+    
+    log_success "Machine information saved to: ${MACHINE_INFO_FILE}"
+    
+    # Install dependencies according to README
+    log_info "Installing Python dependencies..."
+    if [[ -f requirements.txt ]]; then
+        pip install -r requirements.txt
+        log_success "Python dependencies installed"
+    else
+        log_warning "requirements.txt not found, skipping dependency installation"
+    fi
+}
+
+################################################################################
+# Step 2: Build Project
+################################################################################
+
+# Step 2: configure and compile a Release build in ./build.
+step2_build() {
+    section_header "STEP 2: Building Project"
+    
+    log_info "Configuring CMake..."
+    cmake -B build -DCMAKE_BUILD_TYPE=Release
+    
+    log_info "Building project..."
+    cmake --build build --config Release -j  # parallel, like the tuning rebuilds
+    log_success "Build completed successfully"
+}
+
+################################################################################
+# Step 3: Download and Convert Model
+################################################################################
+
+# Step 3: fetch the model from HuggingFace and convert it to an f32 GGUF file.
+step3_download_convert() {
+    section_header "STEP 3: Download and Convert Model"
+    if [[ -d "${MODEL_DIR}" ]] && [[ -f "${MODEL_DIR}/ggml-model-f32.gguf" ]]; then
+        log_warning "Model directory already exists and contains f32 model, skipping download"
+        # Without a terminal there is nobody to answer: keep the existing model
+        # instead of letting `read` hit EOF and abort the run under `set -e`.
+        [[ -t 0 ]] || return 0
+        read -p "Do you want to re-download and convert? (y/N): " -n 1 -r
+        echo
+        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+            return
+        fi
+    fi
+    # Create model directory
+    mkdir -p "${MODEL_DIR}"
+    # Download from HuggingFace
+    log_info "Downloading model from HuggingFace: ${HF_REPO}"
+    if command -v huggingface-cli &> /dev/null; then
+        huggingface-cli download "${HF_REPO}" --local-dir "${MODEL_DIR}"
+    else
+        log_error "huggingface-cli not found. Please install it with: pip install huggingface_hub"
+        exit 1
+    fi
+    # Convert to f32 GGUF using the helper script
+    log_info "Converting model to f32 GGUF format..."
+    if [[ -f "utils/convert-helper-bitnet.py" ]]; then
+        python utils/convert-helper-bitnet.py "${MODEL_DIR}"
+        # The helper writes ggml-model-f32-bitnet.gguf; rename it to the expected name
+        if [[ -f "${MODEL_DIR}/ggml-model-f32-bitnet.gguf" ]]; then
+            mv "${MODEL_DIR}/ggml-model-f32-bitnet.gguf" "${MODEL_DIR}/ggml-model-f32.gguf"
+        fi
+        # Do not report success unless the f32 model really exists
+        [[ -f "${MODEL_DIR}/ggml-model-f32.gguf" ]] || { log_error "Conversion did not produce ${MODEL_DIR}/ggml-model-f32.gguf"; exit 1; }
+    else
+        log_error "Convert helper script not found"
+        exit 1
+    fi
+    
+    log_success "Model downloaded and converted to f32 GGUF"
+}
+
+################################################################################
+# Step 4: Quantize Embeddings
+################################################################################
+
+# Step 4: run embed_quant.sh from the current directory.
+step4_quantize_embeddings() {
+    local quant_script="embed_quant.sh"
+    section_header "STEP 4: Quantize Embeddings"
+    log_info "Running embed_quant.sh to create different embedding quantization variants..."
+
+    # Stop the whole run when the script is absent
+    [[ -f "${quant_script}" ]] || {
+        log_error "embed_quant.sh not found"
+        exit 1
+    }
+    bash "${quant_script}"
+    log_success "Embedding quantization completed"
+}
+
+################################################################################
+# Step 5: Tune GEMM Block Sizes
+################################################################################
+
+# Step 5: grid-search GEMM block sizes with a generated helper script, then
+# write the fastest configuration into include/gemm-config.h and rebuild.
+step5_tune_gemm() {
+    local NPROC BEST_CONFIG BEST_ROW BEST_COL BEST_PAR BEST_PERF
+    section_header "STEP 5: Tune GEMM Block Sizes"
+    log_info "Running GEMM block size tuning..."
+    
+    # Backup original tune script if needed (skipped when the script is absent,
+    # since this step only writes tune_gemm_blocks_auto.sh)
+    if [[ -f "tune_gemm_blocks.sh" ]] && [[ ! -f "tune_gemm_blocks.sh.bak" ]]; then
+        cp tune_gemm_blocks.sh tune_gemm_blocks.sh.bak
+    fi
+    
+    # Thread count for the tuning runs; handed to the generated script as TUNE_THREADS
+    NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8")
+    log_info "Generating tune_gemm_blocks_auto.sh for comprehensive search..."
+    
+    # The delimiter is quoted, so nothing below is expanded until the generated script runs
+    cat > tune_gemm_blocks_auto.sh << 'EOF'
+#!/bin/bash
+set -e
+
+HEADER_FILE="include/gemm-config.h"
+BENCH_CMD="./build/bin/llama-bench -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_i2_s.gguf -p 128 -n 0 -t ${TUNE_THREADS:-16} -ngl 0"
+BUILD_CMD="cmake --build build --config Release -j"
+
+ACT_PARALLEL_DEFINE=true
+
+# Expanded search space for better tuning
+ROW_BLOCK_VALUES=(2 4 8)
+COL_BLOCK_VALUES=(64 128 256)
+PARALLEL_SIZE_VALUES=(2 4 8)
+
+BEST_PERF=0
+BEST_ROW_BLOCK=0
+BEST_COL_BLOCK=0
+BEST_PARALLEL_SIZE=0
+LOG_FILE="stats/tuning_log.csv"
+
+if [ -f "$HEADER_FILE" ]; then
+    cp "$HEADER_FILE" "${HEADER_FILE}.bak"
+fi
+
+echo "Starting comprehensive tuning process..."
+echo "row_block,col_block,parallel_size,tokens_per_second" > "$LOG_FILE"
+
+cleanup() {
+    echo "Restoring original header file..."
+    if [ -f "${HEADER_FILE}.bak" ]; then
+        mv "${HEADER_FILE}.bak" "$HEADER_FILE"
+    fi
+    echo "Tuning finished."
+    echo "Best: ROW_BLOCK=${BEST_ROW_BLOCK}, COL_BLOCK=${BEST_COL_BLOCK}, PARALLEL=${BEST_PARALLEL_SIZE} -> ${BEST_PERF} tokens/s"
+}
+
+trap cleanup EXIT
+
+for ps in "${PARALLEL_SIZE_VALUES[@]}"; do
+    for rb in "${ROW_BLOCK_VALUES[@]}"; do
+        for cb in "${COL_BLOCK_VALUES[@]}"; do
+            echo "Testing: ROW=${rb}, COL=${cb}, PARALLEL=${ps}"
+            
+            echo "// Auto-generated by tuning script" > "$HEADER_FILE"
+            if [ "$ACT_PARALLEL_DEFINE" = "true" ]; then
+                echo "#define ACT_PARALLEL" >> "$HEADER_FILE"
+            fi
+            echo "#if defined(ACT_PARALLEL)" >> "$HEADER_FILE"
+            echo "    #define ROW_BLOCK_SIZE ${rb}" >> "$HEADER_FILE"
+            echo "    #define COL_BLOCK_SIZE ${cb}" >> "$HEADER_FILE"
+            echo "    #define PARALLEL_SIZE ${ps}" >> "$HEADER_FILE"
+            echo "#else" >> "$HEADER_FILE"
+            echo "    #define ROW_BLOCK_SIZE ${rb}" >> "$HEADER_FILE"
+            echo "    #define COL_BLOCK_SIZE ${cb}" >> "$HEADER_FILE"
+            echo "    #define PARALLEL_SIZE ${ps}" >> "$HEADER_FILE"
+            echo "#endif" >> "$HEADER_FILE"
+            
+            $BUILD_CMD > /dev/null 2>&1
+            
+            output=$(eval "$BENCH_CMD" 2>&1)
+            
+            perf=$(echo "$output" | awk -F '|' '
+                /pp128/ && /bitnet/ {
+                    gsub(/ /, "", $8);
+                    split($8, perf, "±");
+                    print perf[1];
+                    exit;
+                }
+            ')
+            
+            if [ -z "$perf" ]; then
+                perf=0
+            fi
+            
+            echo "Performance: ${perf} tokens/s"
+            echo "${rb},${cb},${ps},${perf}" >> "$LOG_FILE"
+            
+            if awk -v cur="$perf" -v best="$BEST_PERF" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
+                BEST_PERF=$perf
+                BEST_ROW_BLOCK=$rb
+                BEST_COL_BLOCK=$cb
+                BEST_PARALLEL_SIZE=$ps
+                echo "*** New best found! ***"
+            fi
+        done
+    done
+done
+
+echo "Best configuration: ROW=${BEST_ROW_BLOCK}, COL=${BEST_COL_BLOCK}, PARALLEL=${BEST_PARALLEL_SIZE}"
+echo "Best performance: ${BEST_PERF} tokens/s"
+EOF
+    
+    chmod +x tune_gemm_blocks_auto.sh
+    TUNE_THREADS="${NPROC}" bash tune_gemm_blocks_auto.sh
+    
+    # Read the best configuration from the log
+    if [[ -f "stats/tuning_log.csv" ]]; then
+        BEST_CONFIG=$(tail -n +2 "stats/tuning_log.csv" | sort -t',' -k4 -nr | head -1)
+        IFS=',' read -r BEST_ROW BEST_COL BEST_PAR BEST_PERF <<< "${BEST_CONFIG}"
+        # Never overwrite the header with empty or unmeasured values
+        if [[ -z "${BEST_CONFIG}" ]] || [[ "${BEST_PERF}" == "0" ]]; then
+            log_error "Tuning produced no usable measurement, keeping current gemm-config.h"
+            return 0
+        fi
+        log_success "Best configuration found:"
+        log_success "  ROW_BLOCK_SIZE=${BEST_ROW}, COL_BLOCK_SIZE=${BEST_COL}, PARALLEL_SIZE=${BEST_PAR}"
+        log_success "  Performance: ${BEST_PERF} tokens/s"
+        # Apply the best configuration
+        log_info "Applying best configuration to gemm-config.h..."
+        cat > include/gemm-config.h << EOF
+// Auto-generated with best tuning results
+// Best performance: ${BEST_PERF} tokens/s
+#define ACT_PARALLEL
+#if defined(ACT_PARALLEL)
+    #define ROW_BLOCK_SIZE ${BEST_ROW}
+    #define COL_BLOCK_SIZE ${BEST_COL}
+    #define PARALLEL_SIZE ${BEST_PAR}
+#else
+    #define ROW_BLOCK_SIZE ${BEST_ROW}
+    #define COL_BLOCK_SIZE ${BEST_COL}
+    #define PARALLEL_SIZE ${BEST_PAR}
+#endif
+EOF
+        # Rebuild with best configuration
+        log_info "Rebuilding with best configuration..."
+        cmake --build build --config Release -j
+        
+        log_success "GEMM tuning completed and applied"
+    else
+        log_error "Tuning log not found"
+    fi
+}
+
+################################################################################
+# Step 6: Run Performance Benchmarks
+################################################################################
+
+# Step 6: run llama-bench over a sweep of thread counts and turn its table
+# into a Markdown report.
+# Globals read:    MODEL_DIR, MODEL_NAME, BENCH_RAW_FILE, BENCH_RESULTS_FILE
+# Globals written: NPROC, THREAD_COUNTS, i
+# Files written:   bench.sh, BENCH_RAW_FILE, BENCH_RESULTS_FILE
+step6_benchmark() {
+    section_header "STEP 6: Running Performance Benchmarks"
+    
+    # Core count of this machine ("8" when it cannot be detected)
+    NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8")
+    log_info "Detected ${NPROC} CPU cores"
+    
+    # Comma-separated list 1,2,4,... of the powers of two not exceeding NPROC
+    THREAD_COUNTS="1"
+    i=2
+    while (( i <= NPROC )); do
+        THREAD_COUNTS="${THREAD_COUNTS},${i}"
+        (( i *= 2 ))
+    done
+    
+    log_info "Testing with thread counts: ${THREAD_COUNTS}"
+    
+    # Emit the runner: MODEL_DIR and THREAD_COUNTS expand now, the \$-escaped names are left for bench.sh
+    cat > bench.sh << EOF
+#!/bin/bash
+set -e
+
+MODEL="${MODEL_DIR}/ggml-model-i2_s_embed_q6_k.gguf"
+THREADS="${THREAD_COUNTS}"
+
+if [[ ! -f "\${MODEL}" ]]; then
+    echo "Error: Model not found: \${MODEL}"
+    exit 1
+fi
+
+./build/bin/llama-bench -m "\${MODEL}" -p 128 -n 128 -t "\${THREADS}" -ngl 0
+EOF
+    
+    chmod +x bench.sh
+    
+    log_info "Running benchmark..."
+    
+    # Run the sweep; tee shows the output and keeps a raw copy for parsing
+    ./bench.sh 2>&1 | tee "${BENCH_RAW_FILE}"
+    
+    # Build the Markdown report from the raw copy
+    log_info "Parsing benchmark results..."
+    
+    {
+        printf '%s\n' \
+            "# Benchmark Results" \
+            "" \
+            "**Machine:** $(uname -m)" \
+            "**Timestamp:** $(date)" \
+            "**Model:** ${MODEL_NAME}" \
+            "**Quantization:** I2_S weight, Q6_K embeddings" \
+            "" \
+            "## Performance Summary" \
+            "" \
+            "| Threads | Test Type | Tokens/sec | Std Dev |" \
+            "|---------|-----------|------------|---------|"
+        
+        # With -F '|' a table row yields $6=threads, $7=test, $8="t/s ± stddev"
+        awk -F '|' '
+            # Strip leading/trailing blanks from one table cell
+            function trim(s) { gsub(/^[[:space:]]+|[[:space:]]+$/, "", s); return s }
+            /bitnet.*(pp|tg)128/ {
+                n_threads = trim($6);
+                test_name = trim($7);
+                split(trim($8), rate, "±");
+                printf "| %7s | %9s | %10s | %7s |\n", n_threads, test_name, trim(rate[1]), trim(rate[2]);
+            }
+        ' "${BENCH_RAW_FILE}"
+        
+        echo ""
+        echo "## Detailed Output"
+        echo ""
+        echo '```'
+        cat "${BENCH_RAW_FILE}"
+        echo '```'
+        
+    } > "${BENCH_RESULTS_FILE}"
+    
+    log_success "Benchmark results saved to: ${BENCH_RESULTS_FILE}"
+}
+
+################################################################################
+# Step 7: Run PPL Benchmarks
+################################################################################
+
+# Step 7: generate embed_quant_ppl_auto.sh, run it over every dataset found
+# under data/, and write Markdown (PPL_RESULTS_FILE) and CSV (PPL_CSV_FILE) reports.
+step7_ppl_benchmark() {
+    section_header "STEP 7: Running Perplexity (PPL) Benchmarks"
+    log_info "Checking benchmark datasets..."
+    # Check which datasets are available
+    DATASETS=""
+    for ds in data/wikitext-2-raw/wiki.test.raw data/ptb/ptb.test.txt data/lambada/lambada_test_plain_text.txt data/clue/tnews.test.txt; do
+        if [[ -f "$ds" ]]; then
+            DATASETS="${DATASETS} ${ds}"
+            log_info "Found dataset: ${ds}"
+        else
+            log_warning "Dataset not found: ${ds}"
+        fi
+    done
+    
+    if [[ -z "${DATASETS}" ]]; then
+        log_error "No benchmark datasets found in data/ directory"
+        log_warning "Skipping PPL benchmarks"
+        return
+    fi
+    
+    log_info "Creating PPL benchmark script..."
+    
+    # Create a modified PPL script
+    cat > embed_quant_ppl_auto.sh << 'EOFPPL'
+#!/usr/bin/env bash
+set -euo pipefail
+
+BIN="./build/bin/llama-perplexity"
+MODEL_DIR="models/BitNet-b1.58-2B-4T"
+MODEL_TEMPLATE="ggml-model-i2_s_embed_{ET}.gguf"
+
+EMBED_TYPES="f32 bf16 f16 i2_s q3_k q4_0 q5_0 q6_k tq1_0 tq2_0"
+DATASETS="DATASETS_PLACEHOLDER"
+
+THREADS="${THREADS:-16}"
+NGL="${NGL:-0}"
+
+CSV_LOG="ppl_results_temp.csv"
+
+if [[ ! -x "$BIN" ]]; then
+  echo "Error: llama-perplexity not found at $BIN" >&2
+  exit 1
+fi
+
+model_size_mib() {
+  local f="$1"
+  local sz
+  sz=$(stat -c %s "$f" 2>/dev/null || stat -f %z "$f" 2>/dev/null || echo 0)
+  awk -v b="$sz" 'BEGIN { printf("%.2f", b/1024/1024) }'
+}
+
+extract_ppl_final() {
+  # POSIX awk only: 2-argument match() plus RSTART/RLENGTH, no gawk capture array or \s
+  awk '
+    /Final estimate/ && /PPL/ {
+      if (match($0, /PPL[ \t]*=[ \t]*[0-9]+(\.[0-9]+)?[ \t]*\+\/-[ \t]*[0-9]+(\.[0-9]+)?/)) {
+        pair = substr($0, RSTART, RLENGTH);
+        sub(/^PPL[ \t]*=[ \t]*/, "", pair);
+        sub(/[ \t]*\+\/-[ \t]*/, ",", pair);
+        print pair;
+        found=1;
+      }
+    }
+    END { if (!found) exit 1 }
+  '
+}
+
+extract_perplexity() {
+  awk '
+    {
+      for (i=1; i<=NF; ++i) {
+        if (tolower($i) ~ /perplexity/) {
+          for (j=i; j<=NF; ++j) {
+            if ($j ~ /^[0-9]+(\.[0-9]+)?$/) { p=$j; break }
+            gsub(/^.*=/, "", $j); gsub(/,$/, "", $j); gsub(/^\(/, "", $j); gsub(/\)$/, "", $j)
+            if ($j ~ /^[0-9]+(\.[0-9]+)?$/) { p=$j; break }
+          }
+        }
+      }
+      if (p) last=p
+    }
+    END { if (last) print last }'
+}
+
+echo "| embed-type |           model |   size | dataset | threads |        ppl |"
+echo "| ---------- | --------------: | -----: | ------: | ------: | ---------: |"
+echo "embed_type,model,model_size_mib,dataset,threads,perplexity,perplexity_err" > "$CSV_LOG"
+
+for et in $EMBED_TYPES; do
+  model_glob="${MODEL_DIR}/$(echo "$MODEL_TEMPLATE" | sed "s/{ET}/$et/")"
+
+  found_any=0
+  for model in $model_glob; do
+    [[ -e "$model" ]] || continue
+    found_any=1
+  done
+
+  if [[ $found_any -eq 0 ]]; then
+    echo "Warning: no models found for embed type '$et', skipping." >&2
+    continue
+  fi
+
+  for model in $model_glob; do
+    [[ -e "$model" ]] || continue
+    size_mib=$(model_size_mib "$model")
+
+    for ds in $DATASETS; do
+      if [[ ! -r "$ds" ]]; then
+        echo "Warning: dataset not found: $ds (skipping)" >&2
+        continue
+      fi
+
+      echo "==> Testing: model=$model, dataset=$ds"
+      out=$("$BIN" -m "$model" -f "$ds" -t "$THREADS" -ngl "$NGL" 2>&1 || true)
+
+      ppl_pair=$(echo "$out" | extract_ppl_final || true)
+      if [[ -n "${ppl_pair:-}" ]]; then
+        ppl="${ppl_pair%%,*}"
+        ppl_err="${ppl_pair##*,}"
+      else
+        ppl=$(echo "$out" | extract_perplexity || true)
+        if [[ -z "${ppl:-}" ]]; then
+          ppl="NA"
+        fi
+        ppl_err="NA"
+      fi
+
+      if [[ "$ppl_err" != "NA" ]]; then
+        ppl_disp="$ppl ± $ppl_err"
+      else
+        ppl_disp="$ppl"
+      fi
+
+      printf "| %10s | %14s | %6s MiB | %7s | %7s | %10s |\n" \
+        "$et" "$(basename "$model")" "$size_mib" "$(basename "$ds")" "$THREADS" "$ppl_disp"
+
+      echo "$et,$(basename "$model"),$size_mib,$(basename "$ds"),$THREADS,$ppl,$ppl_err" >> "$CSV_LOG"
+    done
+  done
+done
+
+echo "Done. Results saved to $CSV_LOG"
+EOFPPL
+    # Replace DATASETS placeholder (plain sed + mv: the in-place flag of sed is not portable between GNU and BSD)
+    sed "s|DATASETS_PLACEHOLDER|${DATASETS}|g" embed_quant_ppl_auto.sh > embed_quant_ppl_auto.sh.tmp
+    mv embed_quant_ppl_auto.sh.tmp embed_quant_ppl_auto.sh
+    chmod +x embed_quant_ppl_auto.sh
+    
+    log_info "Running PPL benchmarks (this may take a while)..."
+    # Run the PPL benchmark
+    ./embed_quant_ppl_auto.sh 2>&1 | tee "${PPL_RESULTS_FILE}.raw"
+    # Format the results
+    {
+        echo "# Perplexity (PPL) Benchmark Results"
+        echo ""
+        echo "**Machine:** $(uname -m)"
+        echo "**Timestamp:** $(date)"
+        echo "**Model:** ${MODEL_NAME}"
+        echo ""
+        echo "## Results by Embedding Type"
+        echo ""
+        
+        grep "^|" "${PPL_RESULTS_FILE}.raw" || true
+        echo ""
+        echo "## Summary Statistics"
+        echo ""
+        
+        if [[ -f "ppl_results_temp.csv" ]]; then
+            # Copy to final location
+            cp ppl_results_temp.csv "${PPL_CSV_FILE}"
+            
+            # Generate summary by embed type
+            echo "### Average PPL by Embedding Type"
+            echo ""
+            echo "| Embed Type | Avg PPL | Models Tested |"
+            echo "|------------|---------|---------------|"
+            
+            awk -F',' '
+                NR > 1 && $6 != "NA" {
+                    sum[$1] += $6;
+                    count[$1]++;
+                }
+                END {
+                    for (et in sum) {
+                        printf "| %10s | %7.2f | %13d |\n", et, sum[et]/count[et], count[et];
+                    }
+                }
+            ' "${PPL_CSV_FILE}" | sort -t'|' -k3 -n
+            
+            echo ""
+        fi
+        
+        echo "## Full Raw Output"
+        echo ""
+        echo '```'
+        cat "${PPL_RESULTS_FILE}.raw"
+        echo '```'
+    } > "${PPL_RESULTS_FILE}"
+    
+    log_success "PPL results saved to: ${PPL_RESULTS_FILE}"
+    log_success "PPL CSV data saved to: ${PPL_CSV_FILE}"
+}
+
+################################################################################
+# Main Execution
+################################################################################
+
+# Entry point: runs the seven steps in order and prints where the reports are.
+main() {
+    local step
+    section_header "Paper Benchmark Automation - Starting"
+    
+    log_info "All results will be saved to: ${STATS_DIR}/"
+    log_info "Timestamp: ${TIMESTAMP}"
+    
+    # Execute all steps, each one called by name in pipeline order
+    for step in \
+        step1_machine_info step2_build step3_download_convert step4_quantize_embeddings \
+        step5_tune_gemm step6_benchmark step7_ppl_benchmark; do
+        "${step}"
+    done
+    
+    # Final summary
+    section_header "All Benchmarks Completed!"
+    
+    log_success "Results summary:"
+    log_success "  - Machine info:     ${MACHINE_INFO_FILE}"
+    log_success "  - Benchmark:        ${BENCH_RESULTS_FILE}"
+    log_success "  - PPL results:      ${PPL_RESULTS_FILE}"
+    log_success "  - PPL CSV:          ${PPL_CSV_FILE}"
+    log_success "  - GEMM tuning log:  stats/tuning_log.csv"
+    
+    echo ""
+    log_info "You can find all results in the ${STATS_DIR}/ directory"
+}
+
+# Run main function
+main "$@"
diff --git a/test_benchmark_setup.sh b/test_benchmark_setup.sh
new file mode 100755
index 0000000..0190cb3
--- /dev/null
+++ b/test_benchmark_setup.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+################################################################################
+# Quick Test Script for Benchmark Automation
+# This script tests individual components without running full benchmarks
+################################################################################
+
+set -euo pipefail
+# ANSI color escapes used for the check marks printed by the tests
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+NC='\033[0m'
+banner_rule="========================================"
+printf '%s\n' \
+    "${banner_rule}" \
+    "Testing Benchmark Automation Components" \
+    "${banner_rule}" ""
+
+# Test 1: Check system info
+echo "Test 1: System Information"
+echo "  Architecture: $(uname -m)"
+echo "  CPU cores: $(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 'unknown')"
+# Resolve the interpreter first so a missing `python` adds no "command not found" text to the report
+py_bin=$(command -v python || command -v python3 || true)
+echo "  Python: $(if [[ -n "${py_bin}" ]]; then "${py_bin}" --version 2>&1; else echo 'not found'; fi)"
+# "Label:command" pairs, each reported with the first line of `<command> --version`
+for probe in CMake:cmake Clang:clang; do
+    if command -v "${probe#*:}" &> /dev/null; then
+        echo -e "  ${probe%%:*}: ${GREEN}✓${NC} $("${probe#*:}" --version | head -1)"
+    else
+        echo -e "  ${probe%%:*}: ${RED}✗ Not found${NC}"
+    fi
+done
+echo ""
+
+# Test 2: Check required files
+echo "Test 2: Required Files"
+# Paths are checked relative to the current directory
+files=(
+    "embed_quant.sh"
+    "tune_gemm_blocks.sh"
+    "utils/convert-helper-bitnet.py"
+    "requirements.txt"
+)
+for f in "${files[@]}"; do
+    # Start from "missing" and switch to the check mark when the file exists
+    mark="${RED}✗ Missing${NC}"
+    [[ -f "$f" ]] && mark="${GREEN}✓${NC}"
+    echo -e "  $f: ${mark}"
+done
+echo ""
+
+# Test 3: Check build directory
+# Reports whether build/ exists and, if so, whether each expected tool
+# is present under build/bin.
+echo "Test 3: Build Status"
+if [[ ! -d "build" ]]; then
+    echo -e "  build/ directory: ${RED}✗ Not found${NC}"
+else
+    echo -e "  build/ directory: ${GREEN}✓${NC}"
+    # One status line per tool, in this order
+    for tool in \
+        llama-bench \
+        llama-perplexity \
+        llama-quantize
+    do
+        if [[ -f "build/bin/${tool}" ]]; then
+            echo -e "  ${tool}: ${GREEN}✓${NC}"
+        else
+            echo -e "  ${tool}: ${RED}✗ Not built${NC}"
+        fi
+    done
+fi
+echo ""
+
+# Test 4: Check data directory (each dataset is labelled by its parent directory name)
+echo "Test 4: Benchmark Datasets"
+datasets=(
+    "data/wikitext-2-raw/wiki.test.raw"
+    "data/ptb/ptb.test.txt"
+    "data/lambada/lambada_test_plain_text.txt"
+    "data/clue/tnews.test.txt"
+)
+found=0
+for ds in "${datasets[@]}"; do
+    if [[ -f "$ds" ]]; then
+        echo -e "  $(basename "$(dirname "$ds")"): ${GREEN}✓${NC}"
+        found=$((found + 1))
+    else
+        echo -e "  $(basename "$(dirname "$ds")"): ${RED}✗ Not found${NC}"
+    fi
+done
+echo "  Total: $found/${#datasets[@]} datasets available"
+echo ""
+
+# Test 5: Check models
+echo "Test 5: Model Files"
+MODEL_DIR="models/BitNet-b1.58-2B-4T"
+if [[ -d "$MODEL_DIR" ]]; then
+    echo -e "  Model directory: ${GREEN}✓${NC}"
+    if [[ -f "$MODEL_DIR/ggml-model-f32.gguf" ]]; then
+        echo -e "  F32 model: ${GREEN}✓${NC}"
+    else
+        echo -e "  F32 model: ${RED}✗ Not found${NC}"
+    fi
+    # Count quantized models with a glob: `ls | wc -l` exits non-zero on no match and, under pipefail + errexit, ends the script
+    shopt -s nullglob; quant_models=("$MODEL_DIR"/ggml-model-i2_s_embed_*.gguf); shopt -u nullglob
+    quant_count=${#quant_models[@]}
+    if [[ $quant_count -gt 0 ]]; then
+        echo -e "  Quantized models: ${GREEN}✓${NC} ($quant_count files)"
+    else
+        echo -e "  Quantized models: ${RED}✗ None found${NC}"
+    fi
+else
+    echo -e "  Model directory: ${RED}✗ Not found${NC}"
+fi
+echo ""
+
+# Test 6: Thread count generation (1 followed by the powers of two up to NPROC)
+printf '%s\n' "Test 6: Thread Configuration"
+NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8")
+THREAD_COUNTS="1"
+for ((i = 2; i <= NPROC; i *= 2)); do
+    THREAD_COUNTS+=",${i}"
+done
+printf '%s\n' "  Max threads: $NPROC"
+printf '%s\n' "  Test thread counts: $THREAD_COUNTS"
+printf '\n'
+
+# Test 7: Check stats directory (created on the spot when missing)
+echo "Test 7: Output Directory"
+if [[ ! -d "stats" ]]; then
+    echo -e "  stats/ directory: ${RED}✗ Not found${NC}"
+    echo "  Creating stats/ directory..."
+    mkdir -p stats
+    echo -e "  ${GREEN}✓ Created${NC}"
+else
+    echo -e "  stats/ directory: ${GREEN}✓${NC}"
+    file_count=$(ls stats/ 2>/dev/null | wc -l)
+    echo "  Files in stats/: $file_count"
+fi
+echo ""
+
+# Summary: closing banner followed by the commands for the next steps
+printf '%s\n' \
+    "========================================" "Test Summary" \
+    "========================================" \
+    "" \
+    "To run the full benchmark automation:" \
+    "  ./run_paper_benchmarks.sh" \
+    "" \
+    "To build the project first (if not built):" \
+    "  cmake -B build -DCMAKE_BUILD_TYPE=Release" \
+    "  cmake --build build --config Release" \
+    "" \
+    "To download and convert the model:" \
+    "  huggingface-cli download microsoft/BitNet-b1.58-2B-4T --local-dir models/BitNet-b1.58-2B-4T" \
+    "  python utils/convert-helper-bitnet.py models/BitNet-b1.58-2B-4T" \
+    ""