diff --git a/README.md b/README.md index 4318061..bfb09a6 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,10 @@ bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.5 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details. -m2_performance -m2_performance +**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.5x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md). + +performance_comparison ->The tested models are dummy setups used in a research context to demonstrate the inference performance of bitnet.cpp. ## Demo @@ -214,7 +214,7 @@ optional arguments: Directory to save the logging info --quant-type {i2_s,tl1}, -q {i2_s,tl1} Quantization type - --quant-embd Quantize the embeddings to q6_k + --quant-embd Quantize the embeddings to f16 --use-pretuned, -p Use the pretuned kernel parameters ## Usage diff --git a/assets/intel_performance.jpg b/assets/intel_performance.jpg deleted file mode 100644 index 38a1bcf..0000000 Binary files a/assets/intel_performance.jpg and /dev/null differ diff --git a/assets/m2_performance.jpg b/assets/m2_performance.jpg deleted file mode 100644 index 9b59348..0000000 Binary files a/assets/m2_performance.jpg and /dev/null differ diff --git a/assets/performance.png b/assets/performance.png new file mode 100644 index 0000000..03d477d Binary files /dev/null and b/assets/performance.png differ diff --git a/demo_benchmark.sh b/demo_benchmark.sh deleted file mode 100755 index dad999b..0000000 --- a/demo_benchmark.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/bin/bash - -################################################################################ -# Quick Demo of Benchmark Automation -# This runs a subset of benchmarks to verify the script works -################################################################################ - -set -euo pipefail - -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' - -STATS_DIR="stats/demo_$(date +%Y%m%d_%H%M%S)" -mkdir -p "${STATS_DIR}" - -echo -e "${BLUE}========================================${NC}" -echo -e "${BLUE}Quick Benchmark Demo (< 2 mins)${NC}" -echo -e "${BLUE}========================================${NC}" -echo "" -echo "Output directory: ${STATS_DIR}" -echo "" - -# Test 1: Machine info -echo -e "${GREEN}[1/3] Collecting machine info...${NC}" -{ - echo "=== Machine Information ===" - echo "Architecture: $(uname -m)" - echo "CPU cores: $(nproc)" - echo "Timestamp: $(date)" - echo "" - lscpu | head -20 -} | tee "${STATS_DIR}/machine_info.txt" -echo "" - -# Test 2: Quick benchmark test -echo -e "${GREEN}[2/2] Running quick benchmark (single thread, minimal tokens)...${NC}" -if [[ -f "build/bin/llama-bench" ]] && [[ -f 
"models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_q6_k.gguf" ]]; then - ./build/bin/llama-bench \ - -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_q6_k.gguf \ - -p 32 -n 32 -t 1 -ngl 0 \ - 2>&1 | tee "${STATS_DIR}/bench_quick.txt" - - # Parse results - { - echo "# Quick Benchmark Results" - echo "" - echo "| Threads | Test | Tokens/sec |" - echo "|---------|------|------------|" - - awk -F '|' ' - /bitnet.*pp128/ || /bitnet.*tg128/ { - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $6); - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $7); - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $8); - split($8, perf, "±"); - printf "| %7s | %4s | %10s |\n", $6, $7, perf[1]; - } - ' "${STATS_DIR}/bench_quick.txt" - } > "${STATS_DIR}/bench_results.md" - - echo "" - echo -e "${GREEN}Results saved to: ${STATS_DIR}/bench_results.md${NC}" - cat "${STATS_DIR}/bench_results.md" -else - echo "Skipping benchmark (model or binary not found)" -fi -echo "" - -# Test 3: Quick PPL test (using simplified dataset) -echo -e "${GREEN}[3/3] Running quick PPL test (wiki.simple, 1 embed type)...${NC}" - -# Create simplified dataset if needed (first 100 lines for quick demo) -if [[ -f "data/wikitext-2-raw/wiki.test.raw" ]]; then - echo "Creating simplified dataset (100 lines)..." - head -100 data/wikitext-2-raw/wiki.test.raw > data/wikitext-2-raw/wiki.simple.raw -fi - -if [[ -f "build/bin/llama-perplexity" ]] && [[ -f "data/wikitext-2-raw/wiki.simple.raw" ]]; then - { - echo "# Quick PPL Test (Simplified Dataset)" - echo "" - echo "| Embed Type | Dataset | PPL |" - echo "|------------|---------|-----|" - - # Test only one embed type with simplified dataset for speed - embed="q6_k" - model="models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_${embed}.gguf" - if [[ -f "$model" ]]; then - echo "Testing: $embed on wiki.simple..." - output=$(./build/bin/llama-perplexity \ - -m "$model" \ - -f data/wikitext-2-raw/wiki.simple.raw \ - -t 4 -ngl 0 2>&1 || true) - - ppl=$(echo "$output" | awk ' - /Final estimate/ && /PPL/ { - if (match($0, /PPL[[:space:]]*=[[:space:]]*([0-9]+(\.[0-9]+)?)/, m)) { - print m[1]; - exit; - } - } - ') - - if [[ -n "$ppl" ]]; then - echo "| $embed | wiki.simple | $ppl |" - else - echo "| $embed | wiki.simple | N/A |" - fi - fi - } | tee "${STATS_DIR}/ppl_quick.md" - - echo "" - echo -e "${GREEN}Results saved to: ${STATS_DIR}/ppl_quick.md${NC}" - cat "${STATS_DIR}/ppl_quick.md" -else - echo "Skipping PPL test (binary or simplified dataset not found)" - echo "Note: Full PPL test available in: ./run_paper_benchmarks.sh" -fi -echo "" - -echo -e "${BLUE}========================================${NC}" -echo -e "${GREEN}Demo completed! 
(Fast mode - PPL skipped)${NC}" -echo -e "${BLUE}========================================${NC}" -echo "" -echo "All results in: ${STATS_DIR}/" -echo "" -echo "To run the full automation script:" -echo " ./run_paper_benchmarks.sh" -echo "" diff --git a/run_paper_benchmarks.sh b/run_paper_benchmarks.sh deleted file mode 100755 index 975ddde..0000000 --- a/run_paper_benchmarks.sh +++ /dev/null @@ -1,720 +0,0 @@ -#!/bin/bash - -################################################################################ -# Paper Benchmark Automation Script -# This script automates all experiments needed for the paper on both Intel and ARM -################################################################################ - -set -euo pipefail - -# Color codes for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Configuration -STATS_DIR="stats" -MODEL_NAME="BitNet-b1.58-2B-4T" -MODEL_DIR="models/${MODEL_NAME}" -HF_REPO="microsoft/${MODEL_NAME}" -TIMESTAMP=$(date +"%Y%m%d_%H%M%S") -MACHINE_INFO_FILE="${STATS_DIR}/machine_info_${TIMESTAMP}.txt" -BENCH_RESULTS_FILE="${STATS_DIR}/bench_results_${TIMESTAMP}.md" -BENCH_RAW_FILE="${STATS_DIR}/bench_raw_${TIMESTAMP}.txt" -PPL_RESULTS_FILE="${STATS_DIR}/ppl_results_${TIMESTAMP}.md" -PPL_CSV_FILE="${STATS_DIR}/ppl_results_${TIMESTAMP}.csv" - -# Create stats directory if not exists -mkdir -p "${STATS_DIR}" - -################################################################################ -# Helper Functions -################################################################################ - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -section_header() { - echo "" - echo "================================================================================" - echo -e "${GREEN}$1${NC}" - echo "================================================================================" -} - -################################################################################ -# Step 1: Machine Information and Environment Setup -################################################################################ - -step1_machine_info() { - section_header "STEP 1: Machine Information and Environment Setup" - - log_info "Collecting machine information..." 
- - { - echo "================================" - echo "Machine Information" - echo "================================" - echo "Timestamp: $(date)" - echo "" - - echo "--- System Architecture ---" - uname -a - echo "" - - echo "--- CPU Information ---" - if command -v lscpu &> /dev/null; then - lscpu - elif [[ -f /proc/cpuinfo ]]; then - cat /proc/cpuinfo - else - log_warning "Could not get CPU information" - fi - echo "" - - echo "--- CPU Cores ---" - NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "unknown") - echo "Number of CPU cores: ${NPROC}" - echo "" - - echo "--- Memory Information ---" - if command -v free &> /dev/null; then - free -h - elif command -v vm_stat &> /dev/null; then - vm_stat - else - log_warning "Could not get memory information" - fi - echo "" - - echo "--- Architecture Detection ---" - ARCH=$(uname -m) - echo "Architecture: ${ARCH}" - if [[ "${ARCH}" == "x86_64" ]]; then - echo "Platform: Intel/AMD x86_64" - elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then - echo "Platform: ARM64" - else - echo "Platform: Other (${ARCH})" - fi - echo "" - - echo "--- Compiler Information ---" - if command -v clang &> /dev/null; then - clang --version - fi - if command -v gcc &> /dev/null; then - gcc --version - fi - if command -v cmake &> /dev/null; then - cmake --version - fi - echo "" - - echo "--- Python Environment ---" - python --version || python3 --version - if command -v conda &> /dev/null; then - conda --version - echo "Active conda environment: ${CONDA_DEFAULT_ENV:-none}" - fi - echo "" - - } | tee "${MACHINE_INFO_FILE}" - - log_success "Machine information saved to: ${MACHINE_INFO_FILE}" - - # Install dependencies according to README - log_info "Installing Python dependencies..." - if [[ -f requirements.txt ]]; then - pip install -r requirements.txt - log_success "Python dependencies installed" - else - log_warning "requirements.txt not found, skipping dependency installation" - fi -} - -################################################################################ -# Step 2: Build Project -################################################################################ - -step2_build() { - section_header "STEP 2: Building Project" - - log_info "Configuring CMake..." - cmake -B build -DCMAKE_BUILD_TYPE=Release - - log_info "Building project..." - cmake --build build --config Release - - log_success "Build completed successfully" -} - -################################################################################ -# Step 3: Download and Convert Model -################################################################################ - -step3_download_convert() { - section_header "STEP 3: Download and Convert Model" - - if [[ -d "${MODEL_DIR}" ]] && [[ -f "${MODEL_DIR}/ggml-model-f32.gguf" ]]; then - log_warning "Model directory already exists and contains f32 model, skipping download" - read -p "Do you want to re-download and convert? (y/N): " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - return - fi - fi - - # Create model directory - mkdir -p "${MODEL_DIR}" - - # Download from HuggingFace - log_info "Downloading model from HuggingFace: ${HF_REPO}" - if command -v huggingface-cli &> /dev/null; then - huggingface-cli download "${HF_REPO}" --local-dir "${MODEL_DIR}" - else - log_error "huggingface-cli not found. Please install it with: pip install huggingface_hub" - exit 1 - fi - - # Convert to f32 GGUF using the helper script - log_info "Converting model to f32 GGUF format..." 
- if [[ -f "utils/convert-helper-bitnet.py" ]]; then - # The script creates ggml-model-f32-bitnet.gguf, we'll rename it - python utils/convert-helper-bitnet.py "${MODEL_DIR}" - - # Rename the output to match expected name - if [[ -f "${MODEL_DIR}/ggml-model-f32-bitnet.gguf" ]]; then - mv "${MODEL_DIR}/ggml-model-f32-bitnet.gguf" "${MODEL_DIR}/ggml-model-f32.gguf" - fi - else - log_error "Convert helper script not found" - exit 1 - fi - - log_success "Model downloaded and converted to f32 GGUF" -} - -################################################################################ -# Step 4: Quantize Embeddings -################################################################################ - -step4_quantize_embeddings() { - section_header "STEP 4: Quantize Embeddings" - - log_info "Running embed_quant.sh to create different embedding quantization variants..." - - if [[ ! -f "embed_quant.sh" ]]; then - log_error "embed_quant.sh not found" - exit 1 - fi - - bash embed_quant.sh - - log_success "Embedding quantization completed" -} - -################################################################################ -# Step 5: Tune GEMM Block Sizes -################################################################################ - -step5_tune_gemm() { - section_header "STEP 5: Tune GEMM Block Sizes" - - log_info "Running GEMM block size tuning..." - - # Backup original tune script if needed - if [[ ! -f "tune_gemm_blocks.sh.bak" ]]; then - cp tune_gemm_blocks.sh tune_gemm_blocks.sh.bak - fi - - # Get number of threads - NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8") - - # Update the tuning script to use a broader search space - log_info "Updating tune_gemm_blocks.sh for comprehensive search..." - - # Create a temporary tuning script with broader search - cat > tune_gemm_blocks_auto.sh << 'EOF' -#!/bin/bash -set -e - -HEADER_FILE="include/gemm-config.h" -BENCH_CMD="./build/bin/llama-bench -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s_embed_i2_s.gguf -p 128 -n 0 -t 16 -ngl 0" -BUILD_CMD="cmake --build build --config Release -j" - -ACT_PARALLEL_DEFINE=true - -# Expanded search space for better tuning -ROW_BLOCK_VALUES=(2 4 8) -COL_BLOCK_VALUES=(64 128 256) -PARALLEL_SIZE_VALUES=(2 4 8) - -BEST_PERF=0 -BEST_ROW_BLOCK=0 -BEST_COL_BLOCK=0 -BEST_PARALLEL_SIZE=0 -LOG_FILE="stats/tuning_log.csv" - -if [ -f "$HEADER_FILE" ]; then - cp "$HEADER_FILE" "${HEADER_FILE}.bak" -fi - -echo "Starting comprehensive tuning process..." -echo "row_block,col_block,parallel_size,tokens_per_second" > "$LOG_FILE" - -cleanup() { - echo "Restoring original header file..." - if [ -f "${HEADER_FILE}.bak" ]; then - mv "${HEADER_FILE}.bak" "$HEADER_FILE" - fi - echo "Tuning finished." 
- echo "Best: ROW_BLOCK=${BEST_ROW_BLOCK}, COL_BLOCK=${BEST_COL_BLOCK}, PARALLEL=${BEST_PARALLEL_SIZE} -> ${BEST_PERF} tokens/s" -} - -trap cleanup EXIT - -for ps in "${PARALLEL_SIZE_VALUES[@]}"; do - for rb in "${ROW_BLOCK_VALUES[@]}"; do - for cb in "${COL_BLOCK_VALUES[@]}"; do - echo "Testing: ROW=${rb}, COL=${cb}, PARALLEL=${ps}" - - echo "// Auto-generated by tuning script" > "$HEADER_FILE" - if [ "$ACT_PARALLEL_DEFINE" = "true" ]; then - echo "#define ACT_PARALLEL" >> "$HEADER_FILE" - fi - echo "#if defined(ACT_PARALLEL)" >> "$HEADER_FILE" - echo " #define ROW_BLOCK_SIZE ${rb}" >> "$HEADER_FILE" - echo " #define COL_BLOCK_SIZE ${cb}" >> "$HEADER_FILE" - echo " #define PARALLEL_SIZE ${ps}" >> "$HEADER_FILE" - echo "#else" >> "$HEADER_FILE" - echo " #define ROW_BLOCK_SIZE ${rb}" >> "$HEADER_FILE" - echo " #define COL_BLOCK_SIZE ${cb}" >> "$HEADER_FILE" - echo " #define PARALLEL_SIZE ${ps}" >> "$HEADER_FILE" - echo "#endif" >> "$HEADER_FILE" - - $BUILD_CMD > /dev/null 2>&1 - - output=$(eval "$BENCH_CMD" 2>&1) - - perf=$(echo "$output" | awk -F '|' ' - /pp128/ && /bitnet/ { - gsub(/ /, "", $8); - split($8, perf, "±"); - print perf[1]; - exit; - } - ') - - if [ -z "$perf" ]; then - perf=0 - fi - - echo "Performance: ${perf} tokens/s" - echo "${rb},${cb},${ps},${perf}" >> "$LOG_FILE" - - if (( $(echo "$perf > $BEST_PERF" | bc -l) )); then - BEST_PERF=$perf - BEST_ROW_BLOCK=$rb - BEST_COL_BLOCK=$cb - BEST_PARALLEL_SIZE=$ps - echo "*** New best found! ***" - fi - done - done -done - -echo "Best configuration: ROW=${BEST_ROW_BLOCK}, COL=${BEST_COL_BLOCK}, PARALLEL=${BEST_PARALLEL_SIZE}" -echo "Best performance: ${BEST_PERF} tokens/s" -EOF - - chmod +x tune_gemm_blocks_auto.sh - bash tune_gemm_blocks_auto.sh - - # Read the best configuration from the log - if [[ -f "stats/tuning_log.csv" ]]; then - BEST_CONFIG=$(tail -n +2 "stats/tuning_log.csv" | sort -t',' -k4 -nr | head -1) - BEST_ROW=$(echo "$BEST_CONFIG" | cut -d',' -f1) - BEST_COL=$(echo "$BEST_CONFIG" | cut -d',' -f2) - BEST_PAR=$(echo "$BEST_CONFIG" | cut -d',' -f3) - BEST_PERF=$(echo "$BEST_CONFIG" | cut -d',' -f4) - - log_success "Best configuration found:" - log_success " ROW_BLOCK_SIZE=${BEST_ROW}, COL_BLOCK_SIZE=${BEST_COL}, PARALLEL_SIZE=${BEST_PAR}" - log_success " Performance: ${BEST_PERF} tokens/s" - - # Apply the best configuration - log_info "Applying best configuration to gemm-config.h..." - cat > include/gemm-config.h << EOF -// Auto-generated with best tuning results -// Best performance: ${BEST_PERF} tokens/s -#define ACT_PARALLEL -#if defined(ACT_PARALLEL) - #define ROW_BLOCK_SIZE ${BEST_ROW} - #define COL_BLOCK_SIZE ${BEST_COL} - #define PARALLEL_SIZE ${BEST_PAR} -#else - #define ROW_BLOCK_SIZE ${BEST_ROW} - #define COL_BLOCK_SIZE ${BEST_COL} - #define PARALLEL_SIZE ${BEST_PAR} -#endif -EOF - - # Rebuild with best configuration - log_info "Rebuilding with best configuration..." - cmake --build build --config Release -j - - log_success "GEMM tuning completed and applied" - else - log_error "Tuning log not found" - fi -} - -################################################################################ -# Step 6: Run Performance Benchmarks -################################################################################ - -step6_benchmark() { - section_header "STEP 6: Running Performance Benchmarks" - - # Get number of threads for this machine - NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8") - log_info "Detected ${NPROC} CPU cores" - - # Generate thread counts: 1, 2, 4, 8, 16, ... 
- THREAD_COUNTS="1" - for ((i=2; i<=NPROC; i*=2)); do - THREAD_COUNTS="${THREAD_COUNTS},${i}" - done - - log_info "Testing with thread counts: ${THREAD_COUNTS}" - - # Create benchmark script - cat > bench.sh << EOF -#!/bin/bash -set -e - -MODEL="${MODEL_DIR}/ggml-model-i2_s_embed_q6_k.gguf" -THREADS="${THREAD_COUNTS}" - -if [[ ! -f "\${MODEL}" ]]; then - echo "Error: Model not found: \${MODEL}" - exit 1 -fi - -./build/bin/llama-bench -m "\${MODEL}" -p 128 -n 128 -t "\${THREADS}" -ngl 0 -EOF - - chmod +x bench.sh - - log_info "Running benchmark..." - - # Run benchmark and capture output - ./bench.sh 2>&1 | tee "${BENCH_RAW_FILE}" - - # Parse and format results - log_info "Parsing benchmark results..." - - { - echo "# Benchmark Results" - echo "" - echo "**Machine:** $(uname -m)" - echo "**Timestamp:** $(date)" - echo "**Model:** ${MODEL_NAME}" - echo "**Quantization:** I2_S weight, Q6_K embeddings" - echo "" - echo "## Performance Summary" - echo "" - echo "| Threads | Test Type | Tokens/sec | Std Dev |" - echo "|---------|-----------|------------|---------|" - - awk -F '|' ' - /bitnet.*pp128/ || /bitnet.*tg128/ { - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $6); # threads - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $7); # test - gsub(/^[[:space:]]+|[[:space:]]+$/, "", $8); # t/s - - threads = $6; - test = $7; - - split($8, perf, "±"); - tokens = perf[1]; - gsub(/^[[:space:]]+|[[:space:]]+$/, "", tokens); - - stddev = perf[2]; - gsub(/^[[:space:]]+|[[:space:]]+$/, "", stddev); - - printf "| %7s | %9s | %10s | %7s |\n", threads, test, tokens, stddev; - } - ' "${BENCH_RAW_FILE}" - - echo "" - echo "## Detailed Output" - echo "" - echo '```' - cat "${BENCH_RAW_FILE}" - echo '```' - - } > "${BENCH_RESULTS_FILE}" - - log_success "Benchmark results saved to: ${BENCH_RESULTS_FILE}" -} - -################################################################################ -# Step 7: Run PPL Benchmarks -################################################################################ - -step7_ppl_benchmark() { - section_header "STEP 7: Running Perplexity (PPL) Benchmarks" - - log_info "Checking benchmark datasets..." - - # Check which datasets are available - DATASETS="" - for ds in data/wikitext-2-raw/wiki.test.raw data/ptb/ptb.test.txt data/lambada/lambada_test_plain_text.txt data/clue/tnews.test.txt; do - if [[ -f "$ds" ]]; then - DATASETS="${DATASETS} ${ds}" - log_info "Found dataset: ${ds}" - else - log_warning "Dataset not found: ${ds}" - fi - done - - if [[ -z "${DATASETS}" ]]; then - log_error "No benchmark datasets found in data/ directory" - log_warning "Skipping PPL benchmarks" - return - fi - - log_info "Creating PPL benchmark script..." - - # Create a modified PPL script - cat > embed_quant_ppl_auto.sh << 'EOFPPL' -#!/usr/bin/env bash -set -euo pipefail - -BIN="./build/bin/llama-perplexity" -MODEL_DIR="models/BitNet-b1.58-2B-4T" -MODEL_TEMPLATE="ggml-model-i2_s_embed_{ET}.gguf" - -EMBED_TYPES="f32 bf16 f16 i2_s q3_k q4_0 q5_0 q6_k tq1_0 tq2_0" -DATASETS="DATASETS_PLACEHOLDER" - -THREADS="${THREADS:-16}" -NGL="${NGL:-0}" - -CSV_LOG="ppl_results_temp.csv" - -if [[ ! 
-x "$BIN" ]]; then - echo "Error: llama-perplexity not found at $BIN" >&2 - exit 1 -fi - -model_size_mib() { - local f="$1" - local sz - sz=$(stat -c %s "$f" 2>/dev/null || stat -f %z "$f" 2>/dev/null || echo 0) - awk -v b="$sz" 'BEGIN { printf("%.2f", b/1024/1024) }' -} - -extract_ppl_final() { - awk ' - /Final estimate/ && /PPL/ { - if (match($0, /PPL[[:space:]]*=[[:space:]]*([0-9]+(\.[0-9]+)?)\s*\+\/\-\s*([0-9]+(\.[0-9]+)?)/, m)) { - print m[1] "," m[3]; - found=1; - } - } - END { if (!found) exit 1 } - ' -} - -extract_perplexity() { - awk ' - { - for (i=1; i<=NF; ++i) { - if (tolower($i) ~ /perplexity/) { - for (j=i; j<=NF; ++j) { - if ($j ~ /^[0-9]+(\.[0-9]+)?$/) { p=$j; break } - gsub(/^.*=/, "", $j); gsub(/,$/, "", $j); gsub(/^\(/, "", $j); gsub(/\)$/, "", $j) - if ($j ~ /^[0-9]+(\.[0-9]+)?$/) { p=$j; break } - } - } - } - if (p) last=p - } - END { if (last) print last }' -} - -echo "| embed-type | model | size | dataset | threads | ppl |" -echo "| ---------- | --------------: | -----: | ------: | ------: | ---------: |" -echo "embed_type,model,model_size_mib,dataset,threads,perplexity,perplexity_err" > "$CSV_LOG" - -for et in $EMBED_TYPES; do - model_glob="${MODEL_DIR}/$(echo "$MODEL_TEMPLATE" | sed "s/{ET}/$et/")" - - found_any=0 - for model in $model_glob; do - [[ -e "$model" ]] || continue - found_any=1 - done - - if [[ $found_any -eq 0 ]]; then - echo "Warning: no models found for embed type '$et', skipping." >&2 - continue - fi - - for model in $model_glob; do - [[ -e "$model" ]] || continue - size_mib=$(model_size_mib "$model") - - for ds in $DATASETS; do - if [[ ! -r "$ds" ]]; then - echo "Warning: dataset not found: $ds (skipping)" >&2 - continue - fi - - echo "==> Testing: model=$model, dataset=$ds" - out=$("$BIN" -m "$model" -f "$ds" -t "$THREADS" -ngl "$NGL" 2>&1 || true) - - ppl_pair=$(echo "$out" | extract_ppl_final || true) - if [[ -n "${ppl_pair:-}" ]]; then - ppl="${ppl_pair%%,*}" - ppl_err="${ppl_pair##*,}" - else - ppl=$(echo "$out" | extract_perplexity || true) - if [[ -z "${ppl:-}" ]]; then - ppl="NA" - fi - ppl_err="NA" - fi - - if [[ "$ppl_err" != "NA" ]]; then - ppl_disp="$ppl ± $ppl_err" - else - ppl_disp="$ppl" - fi - - printf "| %10s | %14s | %6s MiB | %7s | %7s | %10s |\n" \ - "$et" "$(basename "$model")" "$size_mib" "$(basename "$ds")" "$THREADS" "$ppl_disp" - - echo "$et,$(basename "$model"),$size_mib,$(basename "$ds"),$THREADS,$ppl,$ppl_err" >> "$CSV_LOG" - done - done -done - -echo "Done. Results saved to $CSV_LOG" -EOFPPL - - # Replace DATASETS placeholder - sed -i "s|DATASETS_PLACEHOLDER|${DATASETS}|g" embed_quant_ppl_auto.sh - chmod +x embed_quant_ppl_auto.sh - - log_info "Running PPL benchmarks (this may take a while)..." 
- - # Run the PPL benchmark - ./embed_quant_ppl_auto.sh 2>&1 | tee "${PPL_RESULTS_FILE}.raw" - - # Format the results - { - echo "# Perplexity (PPL) Benchmark Results" - echo "" - echo "**Machine:** $(uname -m)" - echo "**Timestamp:** $(date)" - echo "**Model:** ${MODEL_NAME}" - echo "" - echo "## Results by Embedding Type" - echo "" - - grep "^|" "${PPL_RESULTS_FILE}.raw" || true - - echo "" - echo "## Summary Statistics" - echo "" - - if [[ -f "ppl_results_temp.csv" ]]; then - # Copy to final location - cp ppl_results_temp.csv "${PPL_CSV_FILE}" - - # Generate summary by embed type - echo "### Average PPL by Embedding Type" - echo "" - echo "| Embed Type | Avg PPL | Models Tested |" - echo "|------------|---------|---------------|" - - awk -F',' ' - NR > 1 && $6 != "NA" { - sum[$1] += $6; - count[$1]++; - } - END { - for (et in sum) { - printf "| %10s | %7.2f | %13d |\n", et, sum[et]/count[et], count[et]; - } - } - ' "${PPL_CSV_FILE}" | sort -t'|' -k3 -n - - echo "" - fi - - echo "## Full Raw Output" - echo "" - echo '```' - cat "${PPL_RESULTS_FILE}.raw" - echo '```' - - } > "${PPL_RESULTS_FILE}" - - log_success "PPL results saved to: ${PPL_RESULTS_FILE}" - log_success "PPL CSV data saved to: ${PPL_CSV_FILE}" -} - -################################################################################ -# Main Execution -################################################################################ - -main() { - section_header "Paper Benchmark Automation - Starting" - - log_info "All results will be saved to: ${STATS_DIR}/" - log_info "Timestamp: ${TIMESTAMP}" - - # Execute all steps - step1_machine_info - step2_build - step3_download_convert - step4_quantize_embeddings - step5_tune_gemm - step6_benchmark - step7_ppl_benchmark - - # Final summary - section_header "All Benchmarks Completed!" 
- - log_success "Results summary:" - log_success " - Machine info: ${MACHINE_INFO_FILE}" - log_success " - Benchmark: ${BENCH_RESULTS_FILE}" - log_success " - PPL results: ${PPL_RESULTS_FILE}" - log_success " - PPL CSV: ${PPL_CSV_FILE}" - log_success " - GEMM tuning log: stats/tuning_log.csv" - - echo "" - log_info "You can find all results in the ${STATS_DIR}/ directory" -} - -# Run main function -main "$@" diff --git a/setup_env.py b/setup_env.py index 7d84ed7..f15d65f 100644 --- a/setup_env.py +++ b/setup_env.py @@ -136,12 +136,12 @@ def prepare_model(): # quantize to i2s if platform.system() != "Windows": if quant_embd: - run_command(["./build/bin/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") + run_command(["./build/bin/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") else: run_command(["./build/bin/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s") else: if quant_embd: - run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") + run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") else: run_command(["./build/bin/Release/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s") @@ -228,7 +228,7 @@ def parse_args(): parser.add_argument("--model-dir", "-md", type=str, help="Directory to save/load the model", default="models") parser.add_argument("--log-dir", "-ld", type=str, help="Directory to save the logging info", default="logs") parser.add_argument("--quant-type", "-q", type=str, help="Quantization type", choices=SUPPORTED_QUANT_TYPES[arch], default="i2_s") - parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to q6_k") + parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to f16") parser.add_argument("--use-pretuned", "-p", action="store_true", help="Use the pretuned kernel parameters") return parser.parse_args() diff --git a/test_benchmark_setup.sh b/test_benchmark_setup.sh deleted file mode 100755 index 0190cb3..0000000 --- a/test_benchmark_setup.sh +++ /dev/null @@ -1,160 +0,0 @@ -#!/bin/bash - -################################################################################ -# Quick Test Script for Benchmark Automation -# This script tests individual components without running full benchmarks -################################################################################ - -set -euo pipefail - -GREEN='\033[0;32m' -RED='\033[0;31m' -NC='\033[0m' - -echo "========================================" -echo "Testing Benchmark Automation Components" -echo "========================================" -echo "" - -# Test 1: Check system info -echo "Test 1: System Information" -echo " Architecture: $(uname -m)" -echo " CPU cores: $(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 'unknown')" -echo " Python: $(python --version 2>&1 || python3 --version 2>&1)" -if command -v cmake &> /dev/null; then - echo -e " CMake: ${GREEN}✓${NC} $(cmake --version | head -1)" -else - echo -e " CMake: ${RED}✗ Not found${NC}" -fi -if command -v clang &> /dev/null; then - echo -e " Clang: ${GREEN}✓${NC} $(clang --version | head -1)" -else - echo -e " Clang: ${RED}✗ Not found${NC}" -fi -echo "" - -# Test 2: Check required files -echo "Test 2: 
Required Files" -files=( - "embed_quant.sh" - "tune_gemm_blocks.sh" - "utils/convert-helper-bitnet.py" - "requirements.txt" -) -for f in "${files[@]}"; do - if [[ -f "$f" ]]; then - echo -e " $f: ${GREEN}✓${NC}" - else - echo -e " $f: ${RED}✗ Missing${NC}" - fi -done -echo "" - -# Test 3: Check build directory -echo "Test 3: Build Status" -if [[ -d "build" ]]; then - echo -e " build/ directory: ${GREEN}✓${NC}" - if [[ -f "build/bin/llama-bench" ]]; then - echo -e " llama-bench: ${GREEN}✓${NC}" - else - echo -e " llama-bench: ${RED}✗ Not built${NC}" - fi - if [[ -f "build/bin/llama-perplexity" ]]; then - echo -e " llama-perplexity: ${GREEN}✓${NC}" - else - echo -e " llama-perplexity: ${RED}✗ Not built${NC}" - fi - if [[ -f "build/bin/llama-quantize" ]]; then - echo -e " llama-quantize: ${GREEN}✓${NC}" - else - echo -e " llama-quantize: ${RED}✗ Not built${NC}" - fi -else - echo -e " build/ directory: ${RED}✗ Not found${NC}" -fi -echo "" - -# Test 4: Check data directory -echo "Test 4: Benchmark Datasets" -datasets=( - "data/wikitext-2-raw/wiki.test.raw" - "data/ptb/ptb.test.txt" - "data/lambada/lambada_test_plain_text.txt" - "data/clue/tnews.test.txt" -) -found=0 -for ds in "${datasets[@]}"; do - if [[ -f "$ds" ]]; then - echo -e " $(basename $(dirname $ds)): ${GREEN}✓${NC}" - found=$((found + 1)) - else - echo -e " $(basename $(dirname $ds)): ${RED}✗ Not found${NC}" - fi -done -echo " Total: $found/4 datasets available" -echo "" - -# Test 5: Check models -echo "Test 5: Model Files" -MODEL_DIR="models/BitNet-b1.58-2B-4T" -if [[ -d "$MODEL_DIR" ]]; then - echo -e " Model directory: ${GREEN}✓${NC}" - if [[ -f "$MODEL_DIR/ggml-model-f32.gguf" ]]; then - echo -e " F32 model: ${GREEN}✓${NC}" - else - echo -e " F32 model: ${RED}✗ Not found${NC}" - fi - - # Count quantized models - quant_count=$(ls "$MODEL_DIR"/ggml-model-i2_s_embed_*.gguf 2>/dev/null | wc -l) - if [[ $quant_count -gt 0 ]]; then - echo -e " Quantized models: ${GREEN}✓${NC} ($quant_count files)" - else - echo -e " Quantized models: ${RED}✗ None found${NC}" - fi -else - echo -e " Model directory: ${RED}✗ Not found${NC}" -fi -echo "" - -# Test 6: Thread count generation -echo "Test 6: Thread Configuration" -NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "8") -THREAD_COUNTS="1" -for ((i=2; i<=NPROC; i*=2)); do - THREAD_COUNTS="${THREAD_COUNTS},${i}" -done -echo " Max threads: $NPROC" -echo " Test thread counts: $THREAD_COUNTS" -echo "" - -# Test 7: Check stats directory -echo "Test 7: Output Directory" -if [[ -d "stats" ]]; then - echo -e " stats/ directory: ${GREEN}✓${NC}" - file_count=$(ls stats/ 2>/dev/null | wc -l) - echo " Files in stats/: $file_count" -else - echo -e " stats/ directory: ${RED}✗ Not found${NC}" - echo " Creating stats/ directory..." 
- mkdir -p stats - echo -e " ${GREEN}✓ Created${NC}" -fi -echo "" - -# Summary -echo "========================================" -echo "Test Summary" -echo "========================================" -echo "" -echo "To run the full benchmark automation:" -echo " ./run_paper_benchmarks.sh" -echo "" -echo "To build the project first (if not built):" -echo " cmake -B build -DCMAKE_BUILD_TYPE=Release" -echo " cmake --build build --config Release" -echo "" -echo "To download and convert the model:" -echo " huggingface-cli download microsoft/BitNet-b1.58-2B-4T --local-dir models/BitNet-b1.58-2B-4T" -echo " python utils/convert-helper-bitnet.py models/BitNet-b1.58-2B-4T" -echo "" diff --git a/utils/build_test_gemm_kernel.sh b/utils/build_test_gemm_kernel.sh new file mode 100755 index 0000000..bc45942 --- /dev/null +++ b/utils/build_test_gemm_kernel.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Build script for standalone GEMM kernel benchmark + +set -e + +echo "Building GEMM kernel benchmark..." + +# Compiler settings +CXX=${CXX:-g++} +BUILD_DIR="../build" +SRC_DIR="../src" + +# Create build directory if it doesn't exist +mkdir -p ${BUILD_DIR} + +# Compiler flags +CXXFLAGS="-O3 -march=native -mtune=native -std=c++17 -fopenmp" +CXXFLAGS+=" -I.. -I../include" +CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/include" +CXXFLAGS+=" -I../3rdparty/llama.cpp/ggml/src" +CXXFLAGS+=" -I../3rdparty/llama.cpp/include" +CXXFLAGS+=" -DNDEBUG -ffast-math" + +# Link flags +LDFLAGS="-lm -lpthread" + +# Link with pre-built libraries +GGML_LIB_DIR="../build/3rdparty/llama.cpp/ggml/src" +GGML_SO="${GGML_LIB_DIR}/libggml.so" + +if [ ! -f "${GGML_SO}" ]; then + echo "⚠️ Warning: Cannot find libggml.so" + echo "Please build the project first with: cmake --build build" + exit 1 +fi + +LDFLAGS+=" -L${GGML_LIB_DIR} -lggml -Wl,-rpath,\$ORIGIN/../../${GGML_LIB_DIR}" +echo "Linking with libggml.so" + +# Source files +SOURCES="./test_gemm_kernel.cpp" + +# Output binary +OUTPUT="${BUILD_DIR}/test_gemm_kernel" + +echo "Compiler: ${CXX}" +echo "Flags: ${CXXFLAGS}" +echo "Sources: ${SOURCES}" +echo "" + +# Build +${CXX} ${CXXFLAGS} ${SOURCES} -o ${OUTPUT} ${LDFLAGS} + +if [ $? -eq 0 ]; then + echo "" + echo "✅ Build successful!" + echo "Output: ${OUTPUT}" + echo "" + echo "Usage examples:" + echo " # Default test (n=2048, nr=32, nc=128, 1000 iterations)" + echo " ${OUTPUT}" + echo "" + echo " # Custom matrix sizes" + echo " ${OUTPUT} -n 4096 -r 64 -c 256" + echo "" + echo " # Quick test (fewer iterations)" + echo " ${OUTPUT} -i 100 -w 5" + echo "" + echo " # Large-scale test" + echo " ${OUTPUT} -n 3200 -r 128 -c 512 -i 500" + echo "" +else + echo "" + echo "❌ Build failed!" 
+ exit 1 +fi diff --git a/utils/convert-helper-bitnet.py b/utils/convert-helper-bitnet.py index 5b4149a..9ed8db0 100644 --- a/utils/convert-helper-bitnet.py +++ b/utils/convert-helper-bitnet.py @@ -109,12 +109,12 @@ def main(): except OSError as e: print(f"Warning: Could not remove {preprocessed_output_file}: {e}") - if gguf_f32_output.exists(): - print(f"Removing f32 GGUF: {gguf_f32_output}") - try: - gguf_f32_output.unlink() - except OSError as e: - print(f"Warning: Could not remove {gguf_f32_output}: {e}") + # if gguf_f32_output.exists(): + # print(f"Removing f32 GGUF: {gguf_f32_output}") + # try: + # gguf_f32_output.unlink() + # except OSError as e: + # print(f"Warning: Could not remove {gguf_f32_output}: {e}") if input_backup_file.exists(): if not input_file.exists(): diff --git a/utils/quantize_embeddings.py b/utils/quantize_embeddings.py new file mode 100644 index 0000000..90b8020 --- /dev/null +++ b/utils/quantize_embeddings.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +""" +Embedding Quantization Script +This script converts ggml-model-f32.gguf to multiple quantized versions +with different token embedding types. +""" + +import subprocess +import os +import argparse +import re +import csv +from pathlib import Path +from datetime import datetime + + +class EmbeddingQuantizer: + def __init__(self, input_model, output_dir, quantize_bin="../build/bin/llama-quantize", + bench_bin="../build/bin/llama-bench", stats_dir="../stats", csv_output=None): + self.input_model = Path(input_model) + self.output_dir = Path(output_dir) + self.quantize_bin = Path(quantize_bin) + self.bench_bin = Path(bench_bin) + self.stats_dir = Path(stats_dir) + self.csv_output = Path(csv_output) if csv_output else None + + # Verify input file exists + if not self.input_model.exists(): + raise FileNotFoundError(f"Input model not found: {self.input_model}") + + # Verify quantize tool exists + if not self.quantize_bin.exists(): + raise FileNotFoundError(f"Quantize binary not found: {self.quantize_bin}") + + # Verify bench tool exists + if not self.bench_bin.exists(): + raise FileNotFoundError(f"Benchmark binary not found: {self.bench_bin}") + + # Create output directories + self.output_dir.mkdir(parents=True, exist_ok=True) + self.stats_dir.mkdir(parents=True, exist_ok=True) + + self.results = [] + self.newly_created_files = set() # Track newly created files + + def quantize(self, embedding_type, output_suffix): + """ + Perform single quantization + + Args: + embedding_type: Token embedding type (uppercase format, e.g., Q6_K) + output_suffix: Output file suffix (lowercase format, e.g., q6_k) + + Returns: + bool: Whether successful + """ + output_file = self.output_dir / f"ggml-model-i2_s-embed-{output_suffix}.gguf" + + # Check if file already exists + file_already_existed = output_file.exists() + + if file_already_existed: + print(f"ℹ️ File already exists: {output_file}") + print(f" Skipping quantization, will use existing file for benchmark") + return True + + cmd = [ + str(self.quantize_bin), + "--token-embedding-type", embedding_type, + str(self.input_model), + str(output_file), + "I2_S", + "1", + "1" + ] + + print(f"\n{'='*80}") + print(f"🔄 Quantizing with embedding type: {embedding_type}") + print(f"📥 Input: {self.input_model}") + print(f"📤 Output: {output_file}") + print(f"💻 Command: {' '.join(cmd)}") + print(f"{'='*80}\n") + + start_time = datetime.now() + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=os.getcwd(), + timeout=600 # 10 minute timeout + ) + + end_time = 
datetime.now() + duration = (end_time - start_time).total_seconds() + + if result.returncode == 0: + # Get output file size + file_size_mb = output_file.stat().st_size / (1024 * 1024) + + print(f"✅ Success! Duration: {duration:.2f}s, Size: {file_size_mb:.2f} MB") + + # Record newly created file + if not file_already_existed: + self.newly_created_files.add(output_file) + + # Print part of output + if result.stdout: + print("\n📊 Quantization output:") + print(result.stdout[-500:] if len(result.stdout) > 500 else result.stdout) + + return True + else: + print(f"❌ Failed with return code {result.returncode}") + print(f"Error: {result.stderr}") + return False + + except subprocess.TimeoutExpired: + print(f"❌ Timeout (exceeded 10 minutes)") + return False + + except Exception as e: + print(f"❌ Exception: {e}") + return False + + def benchmark_model(self, output_suffix): + """ + Benchmark model + + Args: + output_suffix: Output file suffix (lowercase format, e.g., q6_k) + + Returns: + dict: Dictionary with benchmark results, or None if failed + """ + model_file = self.output_dir / f"ggml-model-i2_s-embed-{output_suffix}.gguf" + + if not model_file.exists(): + print(f"❌ Model file not found for benchmarking: {model_file}") + return None + + cmd = [ + str(self.bench_bin), + "-m", str(model_file), + "-p", "128", + "-n", "0", + "-t", "1,2,4,8", + "-ngl", "0" + ] + + print(f"\n{'='*80}") + print(f"🏃 Running benchmark for: {output_suffix}") + print(f"💻 Command: {' '.join(cmd)}") + print(f"{'='*80}\n") + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=os.getcwd(), + timeout=300 # 5 minute timeout + ) + + if result.returncode == 0: + print("✅ Benchmark completed successfully") + print("\n📊 Benchmark output:") + print(result.stdout) + + # 解析输出 + bench_results = self.parse_benchmark_output(result.stdout, output_suffix) + return bench_results + else: + print(f"❌ Benchmark failed with return code {result.returncode}") + print(f"Error: {result.stderr}") + return None + + except subprocess.TimeoutExpired: + print(f"❌ Benchmark timeout (exceeded 5 minutes)") + return None + + except Exception as e: + print(f"❌ Benchmark exception: {e}") + return None + + def parse_benchmark_output(self, output, output_suffix): + """ + Parse benchmark output to extract t/s data (mean±std) + + Args: + output: Benchmark command output + output_suffix: Output file suffix + + Returns: + dict: Dictionary with parsed results + """ + results = { + 'embedding_type': output_suffix, + 'threads_1': None, + 'threads_2': None, + 'threads_4': None, + 'threads_8': None, + } + + # Parse table data + # Find lines containing pp128 and t/s + lines = output.strip().split('\n') + + for line in lines: + # Skip header and separator lines + if '|' not in line or 'model' in line or '---' in line: + continue + + # Try to extract data + # Format similar to: | bitnet-25 2B I2_S - 2 bpw ternary | 1012.28 MiB | 2.74 B | CPU | 12 | pp128 | 405.73 ± 3.69 | + parts = [p.strip() for p in line.split('|')] + + if len(parts) >= 8 and 'pp128' in parts[6]: + threads_str = parts[5].strip() + throughput_str = parts[7].strip() + + # Extract thread count + try: + threads = int(threads_str) + except: + continue + + # Extract t/s data (format: "405.73 ± 3.69" or "405.73") + # Try to match "mean ± std" format + match_with_std = re.search(r'([\d.]+)\s*±\s*([\d.]+)', throughput_str) + if match_with_std: + mean = float(match_with_std.group(1)) + std = float(match_with_std.group(2)) + throughput = f"{mean:.2f}±{std:.2f}" + else: + # Only mean, 
no std + match = re.search(r'([\d.]+)', throughput_str) + if match: + throughput = f"{float(match.group(1)):.2f}" + else: + continue + + # Store result based on thread count + if threads == 1: + results['threads_1'] = throughput + elif threads == 2: + results['threads_2'] = throughput + elif threads == 4: + results['threads_4'] = throughput + elif threads == 8: + results['threads_8'] = throughput + + return results + + def cleanup_model(self, output_suffix): + """ + Cleanup model files (only delete newly created files) + + Args: + output_suffix: Output file suffix + """ + model_file = self.output_dir / f"ggml-model-i2_s-embed-{output_suffix}.gguf" + + if model_file in self.newly_created_files: + try: + model_file.unlink() + print(f"🗑️ Deleted newly created file: {model_file}") + self.newly_created_files.remove(model_file) + except Exception as e: + print(f"⚠️ Failed to delete {model_file}: {e}") + else: + print(f"ℹ️ Keeping existing file: {model_file}") + + def run_all_quantizations(self, types_to_quantize): + """ + Run all quantizations + + Args: + types_to_quantize: List of quantization types, tuples of (embedding_type, output_suffix) + """ + print(f"\n{'='*80}") + print(f"🚀 Starting Embedding Quantization and Benchmarking") + print(f"{'='*80}") + print(f"📥 Input model: {self.input_model}") + print(f"📤 Output directory: {self.output_dir}") + print(f"📊 Stats directory: {self.stats_dir}") + print(f"🔢 Total quantizations: {len(types_to_quantize)}") + print(f"{'='*80}\n") + + total_start = datetime.now() + + for i, (embedding_type, output_suffix) in enumerate(types_to_quantize, 1): + print(f"\n{'#'*80}") + print(f"[{i}/{len(types_to_quantize)}] Processing {output_suffix} ({embedding_type})") + print(f"{'#'*80}\n") + + # Quantize model + success = self.quantize(embedding_type, output_suffix) + + if not success: + print(f"⚠️ Skipping benchmark for {output_suffix} due to quantization failure") + continue + + # Run benchmark + bench_results = self.benchmark_model(output_suffix) + + if bench_results: + self.results.append(bench_results) + else: + print(f"⚠️ Benchmark failed for {output_suffix}") + + # Cleanup model files (only delete newly created files) + self.cleanup_model(output_suffix) + + print(f"\n{'#'*80}") + print(f"✅ Completed {output_suffix}") + print(f"{'#'*80}\n") + + total_end = datetime.now() + total_duration = (total_end - total_start).total_seconds() + + # 保存结果到CSV + self.save_results_to_csv() + + # 打印总结 + self.print_summary(total_duration) + + def save_results_to_csv(self): + """将benchmark结果保存到CSV文件""" + if not self.results: + print("⚠️ No results to save") + return + + # Use user-specified CSV path, otherwise use default path + if self.csv_output: + csv_file = self.csv_output + # Ensure parent directory exists + csv_file.parent.mkdir(parents=True, exist_ok=True) + else: + csv_file = self.stats_dir / f"embedding_benchmark.csv" + + print(f"\n💾 Saving results to: {csv_file}") + + try: + with open(csv_file, 'w', newline='') as f: + fieldnames = ['embedding_type', 'threads_1', 'threads_2', 'threads_4', 'threads_8'] + writer = csv.DictWriter(f, fieldnames=fieldnames) + + writer.writeheader() + for result in self.results: + writer.writerow(result) + + print(f"✅ Results saved successfully") + + # Also print table + print(f"\n📊 Benchmark Results:") + print(f"{'Type':<15} {'1 thread':<18} {'2 threads':<18} {'4 threads':<18} {'8 threads':<18}") + print("-" * 87) + for result in self.results: + t1 = result['threads_1'] if result['threads_1'] else "N/A" + t2 = result['threads_2'] if 
result['threads_2'] else "N/A" + t4 = result['threads_4'] if result['threads_4'] else "N/A" + t8 = result['threads_8'] if result['threads_8'] else "N/A" + print(f"{result['embedding_type']:<15} {t1:<18} {t2:<18} {t4:<18} {t8:<18}") + + except Exception as e: + print(f"❌ Failed to save results: {e}") + + def print_summary(self, total_duration): + """Print quantization summary""" + print(f"\n\n{'='*80}") + print(f"📊 QUANTIZATION AND BENCHMARK SUMMARY") + print(f"{'='*80}\n") + + successful = len(self.results) + total = len(self.results) + + print(f"✅ Completed: {successful} benchmarks") + print(f"⏱️ Total duration: {total_duration/60:.2f} minutes\n") + + if self.results: + if self.csv_output and self.csv_output.exists(): + print(f"📁 Results saved to: {self.csv_output}") + else: + csv_files = list(self.stats_dir.glob("embedding_benchmark*.csv")) + if csv_files: + latest_csv = max(csv_files, key=lambda p: p.stat().st_mtime) + print(f"📁 Results saved to: {latest_csv}") + + print(f"\n{'='*80}\n") + + +def main(): + parser = argparse.ArgumentParser(description='Quantize model embeddings to multiple formats') + parser.add_argument('--input', '-i', + default='../models/BitNet-b1.58-2B-4T/ggml-model-f32.gguf', + help='Input model path (default: ../models/BitNet-b1.58-2B-4T/ggml-model-f32.gguf)') + parser.add_argument('--output-dir', '-o', + default='../models/BitNet-b1.58-2B-4T', + help='Output directory (default: ../models/BitNet-b1.58-2B-4T)') + parser.add_argument('--quantize-bin', '-q', + default='../build/bin/llama-quantize', + help='Path to llama-quantize binary (default: ../build/bin/llama-quantize)') + parser.add_argument('--bench-bin', '-b', + default='../build/bin/llama-bench', + help='Path to llama-bench binary (default: ../build/bin/llama-bench)') + parser.add_argument('--stats-dir', + default='../stats', + help='Directory to save benchmark results (default: ../stats)') + parser.add_argument('--csv-output', '-c', + help='Custom path for CSV output file (e.g., stats/my_results.csv)') + parser.add_argument('--types', '-t', + nargs='+', + help='Specific types to quantize (e.g., f32 q6_k q4_0)') + parser.add_argument('--skip-existing', '-s', + action='store_true', + help='Skip quantization if output file already exists (will still benchmark existing files)') + + args = parser.parse_args() + + # Define all supported quantization types + # Format: (embedding_type for command line, output_suffix for filename) + all_types = [ + ('F32', 'f32'), + ('F16', 'f16'), + ('Q8_0', 'q8_0'), + ('Q6_K', 'q6_k'), + ('Q5_0', 'q5_0'), + ('Q4_0', 'q4_0'), + ('Q3_K', 'q3_k'), + ('TQ2_0', 'tq2_0'), + ] + + # If specific types are specified, filter the list + if args.types: + types_lower = [t.lower() for t in args.types] + types_to_quantize = [(et, os) for et, os in all_types if os.lower() in types_lower] + if not types_to_quantize: + print(f"❌ No valid types specified. 
Available types: {', '.join([os for _, os in all_types])}") + return + else: + types_to_quantize = all_types + + # If skip existing files is enabled, no need to filter + # Because new logic will automatically detect and skip during quantization, but will still benchmark + + # 创建量化器并运行 + try: + quantizer = EmbeddingQuantizer( + args.input, + args.output_dir, + args.quantize_bin, + args.bench_bin, + args.stats_dir, + args.csv_output + ) + quantizer.run_all_quantizations(types_to_quantize) + except FileNotFoundError as e: + print(f"❌ Error: {e}") + return 1 + except KeyboardInterrupt: + print("\n\n⚠️ Quantization interrupted by user") + return 1 + except Exception as e: + print(f"\n❌ Unexpected error: {e}") + import traceback + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + exit(main() or 0) diff --git a/utils/test_gemm_kernel.cpp b/utils/test_gemm_kernel.cpp new file mode 100644 index 0000000..36964ce --- /dev/null +++ b/utils/test_gemm_kernel.cpp @@ -0,0 +1,274 @@ +/** + * Standalone benchmark for ggml_gemm_i2_i8_s kernel + * + * This program tests the performance of the ggml_gemm_i2_i8_s kernel + * with configurable matrix sizes and iteration counts. + * + * Usage: ./test_gemm_kernel [options] + * -n : embedding dimension (must be divisible by 4, default: 2048) + * -r : number of rows in matrix Y (default: 32) + * -c : number of columns in matrix X (default: 128) + * -i : number of iterations (default: 1000) + * -w : number of warmup iterations (default: 10) + */ + +#include +#include +#include +#include +#include +#include +#include + +// Include necessary headers +#include "../include/gemm-config.h" + +// Function declarations (from ggml-quants.h) +extern "C" void ggml_vec_dot_i2_i8_s(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc); + +// GEMM kernel definition +void ggml_gemm_i2_i8_s(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +#if defined(ACT_PARALLEL) + const int64_t row_block = ROW_BLOCK_SIZE; + const int64_t col_block = COL_BLOCK_SIZE; + + for (int64_t c0 = 0; c0 < nc; c0 += col_block) { + int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0); + for (int64_t r0 = 0; r0 < nr; r0 += row_block) { + int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0); + const void * vy_r = (const uint8_t *)vy + r0 * n; + for (int64_t c = 0; c < cur_c; ++c) { + const int64_t col = c0 + c; + float * s_col = s + col; + const void * vx_col = (const uint8_t *)vx + col * n / 4; + ggml_vec_dot_i2_i8_s(n, s_col + r0 * bs, bs, vx_col, n, vy_r, n, cur_r); + } + } + } +#else + const int64_t row_block = ROW_BLOCK_SIZE; + const int64_t col_block = COL_BLOCK_SIZE; + + for (int64_t r0 = 0; r0 < nr; r0 += row_block) { + int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0); + for (int64_t c0 = 0; c0 < nc; c0 += col_block) { + int64_t cur_c = (c0 + col_block <= nc) ? 
col_block : (nc - c0); + const void * vx_c = (const uint8_t *)vx + c0 * n / 4; + for (int64_t r = 0; r < cur_r; ++r) { + const int64_t row = r0 + r; + float * s_row = s + row * bs; + const void * vy_row = (const uint8_t *)vy + row * n; + ggml_vec_dot_i2_i8_s(n, s_row + c0, bs, vx_c, n, vy_row, n, cur_c); + } + } + } +#endif +} + +// Helper function to get current time in nanoseconds +double get_time_ns() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1e9 + ts.tv_nsec; +} + +// Initialize matrix with random i2 values (2-bit quantized) +void init_matrix_i2(uint8_t* data, int n, int cols) { + // i2 format: 4 values per byte (2 bits each) + int total_bytes = n * cols / 4; + for (int i = 0; i < total_bytes; i++) { + data[i] = rand() & 0xFF; + } +} + +// Initialize matrix with random i8 values +void init_matrix_i8(int8_t* data, int n, int rows) { + int total_elements = n * rows; + for (int i = 0; i < total_elements; i++) { + data[i] = (int8_t)((rand() % 256) - 128); + } +} + +// Benchmark configuration +struct BenchmarkConfig { + int n; // embedding dimension (must be divisible by 4) + int nr; // number of rows in Y matrix + int nc; // number of columns in X matrix + int iterations; // number of benchmark iterations + int warmup; // number of warmup iterations +}; + +void print_config(const BenchmarkConfig& config) { + printf("=" "=%.78s\n", "==============================================================================="); + printf("Benchmark Configuration:\n"); + printf("=" "=%.78s\n", "==============================================================================="); + printf(" Embedding dimension (n) : %d\n", config.n); + printf(" Matrix Y rows (nr) : %d\n", config.nr); + printf(" Matrix X columns (nc) : %d\n", config.nc); + printf(" Iterations : %d\n", config.iterations); + printf(" Warmup iterations : %d\n", config.warmup); + printf("\nMatrix sizes:\n"); + printf(" X (i2): %d x %d (%.2f KB)\n", config.nc, config.n, + (config.nc * config.n / 4) / 1024.0); + printf(" Y (i8): %d x %d (%.2f KB)\n", config.nr, config.n, + (config.nr * config.n) / 1024.0); + printf(" S (f32): %d x %d (%.2f KB)\n", config.nr, config.nc, + (config.nr * config.nc * sizeof(float)) / 1024.0); + printf("\nGEMM Config:\n"); +#if defined(ACT_PARALLEL) + printf(" ACT_PARALLEL : ON\n"); +#else + printf(" ACT_PARALLEL : OFF\n"); +#endif + printf(" ROW_BLOCK_SIZE : %d\n", ROW_BLOCK_SIZE); + printf(" COL_BLOCK_SIZE : %d\n", COL_BLOCK_SIZE); + printf(" PARALLEL_SIZE : %d\n", PARALLEL_SIZE); + printf("=" "=%.78s\n\n", "==============================================================================="); +} + +void run_benchmark(const BenchmarkConfig& config) { + // Allocate matrices + printf("Allocating matrices...\n"); + + // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes + uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4); + + // Y matrix (i8 format): nr x n + int8_t* Y = (int8_t*)malloc(config.nr * config.n); + + // Result matrix (float32): nr x nc + float* S = (float*)malloc(config.nr * config.nc * sizeof(float)); + + if (!X || !Y || !S) { + fprintf(stderr, "Failed to allocate memory\n"); + exit(1); + } + + // Initialize matrices with random data + printf("Initializing matrices with random data...\n"); + srand(time(NULL)); + init_matrix_i2(X, config.n, config.nc); + init_matrix_i8(Y, config.n, config.nr); + memset(S, 0, config.nr * config.nc * sizeof(float)); + + // Warmup + printf("Running %d warmup iterations...\n", config.warmup); + for (int i = 0; i < config.warmup; 
i++) { + ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc); + } + + // Benchmark + printf("Running %d benchmark iterations...\n", config.iterations); + double total_time = 0.0; + double min_time = 1e20; + double max_time = 0.0; + + for (int i = 0; i < config.iterations; i++) { + double start = get_time_ns(); + ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc); + double end = get_time_ns(); + + double elapsed = end - start; + total_time += elapsed; + if (elapsed < min_time) min_time = elapsed; + if (elapsed > max_time) max_time = elapsed; + + if ((i + 1) % 100 == 0) { + printf(" Progress: %d/%d iterations\n", i + 1, config.iterations); + } + } + + // Calculate statistics + double avg_time_ns = total_time / config.iterations; + double avg_time_ms = avg_time_ns / 1e6; + double min_time_ms = min_time / 1e6; + double max_time_ms = max_time / 1e6; + + // Calculate GFLOPS + // For GEMM: nr x nc x n multiply-adds = 2 * nr * nc * n FLOPs + double flops = 2.0 * config.nr * config.nc * config.n; + double gflops = (flops / avg_time_ns); + + // Calculate throughput (tokens/s assuming each column is a token) + double throughput = (config.nc * 1e9) / avg_time_ns; + + // Print results + printf("\n"); + printf("=" "=%.78s\n", "==============================================================================="); + printf("Benchmark Results:\n"); + printf("=" "=%.78s\n", "==============================================================================="); + printf(" Average time : %.3f ms\n", avg_time_ms); + printf(" Min time : %.3f ms\n", min_time_ms); + printf(" Max time : %.3f ms\n", max_time_ms); + printf(" Std dev : %.3f ms\n", sqrt((max_time_ms - min_time_ms) * (max_time_ms - min_time_ms) / 12)); + printf("\nPerformance:\n"); + printf(" GFLOPS : %.2f\n", gflops); + printf(" Throughput : %.2f tokens/s\n", throughput); + printf(" Latency/token : %.3f us\n", (avg_time_ms * 1000) / config.nc); + printf("=" "=%.78s\n", "==============================================================================="); + + // Cleanup + free(X); + free(Y); + free(S); +} + +void print_usage(const char* program) { + printf("Usage: %s [options]\n", program); + printf("Options:\n"); + printf(" -n Embedding dimension (must be divisible by 4, default: 2048)\n"); + printf(" -r Number of rows in matrix Y (default: 32)\n"); + printf(" -c Number of columns in matrix X (default: 128)\n"); + printf(" -i Number of iterations (default: 1000)\n"); + printf(" -w Number of warmup iterations (default: 10)\n"); + printf(" -h Show this help message\n"); +} + +int main(int argc, char** argv) { + BenchmarkConfig config = { + .n = 2048, + .nr = 32, + .nc = 128, + .iterations = 1000, + .warmup = 10 + }; + + // Parse command line arguments + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-n") == 0 && i + 1 < argc) { + config.n = atoi(argv[++i]); + } else if (strcmp(argv[i], "-r") == 0 && i + 1 < argc) { + config.nr = atoi(argv[++i]); + } else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { + config.nc = atoi(argv[++i]); + } else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) { + config.iterations = atoi(argv[++i]); + } else if (strcmp(argv[i], "-w") == 0 && i + 1 < argc) { + config.warmup = atoi(argv[++i]); + } else if (strcmp(argv[i], "-h") == 0) { + print_usage(argv[0]); + return 0; + } else { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + print_usage(argv[0]); + return 1; + } + } + + // Validate configuration + if (config.n % 4 != 0) { + fprintf(stderr, "Error: Embedding dimension (-n) must be 
diff --git a/utils/test_parallel_strategy.sh b/utils/test_parallel_strategy.sh
new file mode 100755
index 0000000..44da140
--- /dev/null
+++ b/utils/test_parallel_strategy.sh
@@ -0,0 +1,277 @@
+#!/bin/bash
+
+# Script: Test different GEMM parallel strategy performance
+# Strategies: activation-parallel, weight-parallel, and no-parallel
+# Thread counts: 1,2,4,8,12,16
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+GEMM_CONFIG="$PROJECT_ROOT/include/gemm-config.h"
+GEMM_CONFIG_BACKUP="$PROJECT_ROOT/include/gemm-config.h.bak"
+BUILD_DIR="$PROJECT_ROOT/build"
+STATS_DIR="$PROJECT_ROOT/stats"
+CSV_FILE="$STATS_DIR/test_parallel_strategy_benchmark.csv"
+MODEL_PATH="$PROJECT_ROOT/models/BitNet-b1.58-2B-4T/ggml-model-original.gguf"
+BENCHMARK_CMD="./build/bin/llama-bench"
+THREADS_LIST="1 2 4 8 12 16"
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Check prerequisites
+check_prerequisites() {
+    log_info "Checking prerequisites..."
+
+    if [ ! -f "$GEMM_CONFIG" ]; then
+        log_error "gemm-config.h not found: $GEMM_CONFIG"
+        exit 1
+    fi
+
+    if [ ! -f "$MODEL_PATH" ]; then
+        log_error "Model file not found: $MODEL_PATH"
+        exit 1
+    fi
+
+    if [ ! -d "$BUILD_DIR" ]; then
+        log_error "Build directory not found: $BUILD_DIR"
+        exit 1
+    fi
+
+    if [ ! -f "$BUILD_DIR/bin/llama-bench" ]; then
+        log_warn "llama-bench executable not found, building..."
+        build_project
+    fi
+
+    if [ ! -d "$STATS_DIR" ]; then
+        log_info "Creating stats directory..."
+        mkdir -p "$STATS_DIR"
+    fi
+
+    log_info "Prerequisites check completed"
+}
+
+# Backup original config file
+backup_config() {
+    log_info "Backing up gemm-config.h..."
+    cp "$GEMM_CONFIG" "$GEMM_CONFIG_BACKUP"
+    log_info "Backup completed: $GEMM_CONFIG_BACKUP"
+}
+
+# Restore original config file
+restore_config() {
+    if [ -f "$GEMM_CONFIG_BACKUP" ]; then
+        log_info "Restoring original gemm-config.h..."
+        cp "$GEMM_CONFIG_BACKUP" "$GEMM_CONFIG"
+        rm "$GEMM_CONFIG_BACKUP"
+        log_info "Restore completed"
+    else
+        log_warn "Backup file not found, skipping restore"
+    fi
+}
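+
+# Strategy -> gemm-config.h mapping (implemented by the set_* helpers below):
+#   activation-parallel : keep #define ACT_PARALLEL as-is
+#   weight-parallel     : remove #define ACT_PARALLEL
+#   no-parallel         : remove #define ACT_PARALLEL and force
+#                         ROW_BLOCK_SIZE / COL_BLOCK_SIZE to 1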
+
+# Set activation-parallel configuration (keep original ACT_PARALLEL)
+set_activation_parallel() {
+    log_info "Configuration: activation-parallel (keeping #define ACT_PARALLEL)"
+    log_info "Configuration completed"
+}
+
+# Set weight-parallel configuration (remove ACT_PARALLEL)
+set_weight_parallel() {
+    log_info "Configuration: weight-parallel (removing #define ACT_PARALLEL)"
+
+    # Remove ACT_PARALLEL definition
+    sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
+
+    # Verify modification
+    if grep -q "^#define ACT_PARALLEL" "$GEMM_CONFIG"; then
+        log_error "Failed to remove ACT_PARALLEL"
+        exit 1
+    fi
+    log_info "Configuration completed"
+}
+
+# Set no-parallel configuration (remove ACT_PARALLEL + set block sizes to 1)
+set_no_parallel() {
+    log_info "Configuration: no-parallel (removing #define ACT_PARALLEL + setting block sizes to 1)"
+
+    # Remove ACT_PARALLEL definition
+    sed -i '/#define ACT_PARALLEL/d' "$GEMM_CONFIG"
+
+    # Set all ROW_BLOCK_SIZE and COL_BLOCK_SIZE definitions to 1
+    sed -i 's/#define ROW_BLOCK_SIZE [0-9]\+/#define ROW_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
+    sed -i 's/#define COL_BLOCK_SIZE [0-9]\+/#define COL_BLOCK_SIZE 1/g' "$GEMM_CONFIG"
+
+    log_info "Configuration completed"
+}
+
+# Build project
+build_project() {
+    log_info "Building project..."
+    cd "$PROJECT_ROOT"
+
+    if [ ! -f "$BUILD_DIR/Makefile" ]; then
+        log_info "First build, running cmake..."
+        cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release > /dev/null 2>&1
+    fi
+
+    cd "$BUILD_DIR"
+    make -j$(nproc) llama-bench > /dev/null 2>&1
+
+    if [ ! -f "./bin/llama-bench" ]; then
+        log_error "Build failed"
+        exit 1
+    fi
+
+    log_info "Build completed"
+    cd "$PROJECT_ROOT"
+}
+
+# Run benchmark test
+run_benchmark() {
+    local strategy=$1
+    local threads=$2
+
+    cd "$PROJECT_ROOT"
+
+    # Run llama-bench
+    local output=$($BENCHMARK_CMD -m "$MODEL_PATH" -p 128 -n 0 -t "$threads" -ngl 0 2>&1)
+
+    # Extract line containing "pp128"
+    local line=$(echo "$output" | grep "pp128" | tail -1)
+
+    if [ -z "$line" ]; then
+        return 1
+    fi
+
+    echo "$line"
+}
+
+# Extract throughput value from benchmark output
+extract_throughput() {
+    local line=$1
+
+    # The benchmark row has the form:
+    # | model | size | params | backend | threads | test | throughput |
+    # Because the row ends with a trailing '|', the last awk field is empty,
+    # so the throughput ("XXX.XX ± YY.YY") lives in $(NF-1).
+    local throughput=$(echo "$line" | awk -F'|' '{print $(NF-1)}' | xargs | sed 's/\[.*\]//' | xargs)
+
+    echo "$throughput"
+}
+
+# Initialize CSV file
+init_csv() {
+    log_info "Initializing CSV file: $CSV_FILE"
+
+    cat > "$CSV_FILE" << 'EOF'
+Strategy,Threads,Throughput
+EOF
+
+    log_info "CSV file created"
+}
+
+# Add result to CSV
+add_to_csv() {
+    local strategy=$1
+    local threads=$2
+    local throughput=$3
+
+    echo "$strategy,$threads,$throughput" >> "$CSV_FILE"
+}
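+
+# Example (illustrative values): given a benchmark row such as
+#   | bitnet-b1.58 2B | ... | 16 | pp128 | 501.06 ± 11.37 |
+# extract_throughput prints "501.06 ± 11.37" and the CSV gains a line like
+#   weight-parallel,16,501.06 ± 11.37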
+
+# Main function
+main() {
+    log_info "Starting GEMM parallel strategy benchmark tests"
+    log_info "================================================"
+
+    # Check prerequisites
+    check_prerequisites
+
+    # Backup original configuration
+    backup_config
+
+    # Initialize CSV file
+    init_csv
+
+    # Define strategies to test
+    local strategies=("activation-parallel" "weight-parallel" "no-parallel")
+
+    for strategy in "${strategies[@]}"; do
+        log_info "================================================"
+        log_info "Testing strategy: $strategy"
+        log_info "================================================"
+
+        # Restore to original configuration
+        restore_config
+        backup_config
+
+        # Apply configuration based on strategy
+        case $strategy in
+            activation-parallel)
+                set_activation_parallel
+                ;;
+            weight-parallel)
+                set_weight_parallel
+                ;;
+            no-parallel)
+                set_no_parallel
+                ;;
+        esac
+
+        # Rebuild project to apply new configuration
+        log_info "Rebuilding project to apply new configuration..."
+        build_project
+
+        # Run test for each thread count
+        for threads in $THREADS_LIST; do
+            log_info ""
+            log_info "Strategy: $strategy, Threads: $threads"
+
+            # Run test (capture only output, not log messages).
+            # Declare and assign separately: with "local var=$(cmd)" the $?
+            # afterwards reports the status of `local`, not of run_benchmark.
+            local result
+            result=$(run_benchmark "$strategy" "$threads")
+            local test_status=$?
+
+            if [ $test_status -eq 0 ]; then
+                # Extract throughput value from the result line
+                local throughput=$(extract_throughput "$result")
+                log_info "Throughput: $throughput"
+
+                # Add to CSV
+                add_to_csv "$strategy" "$threads" "$throughput"
+            else
+                log_warn "Test failed for strategy $strategy, threads $threads"
+            fi
+
+            sleep 2 # Give system time to cool down
+        done
+    done
+
+    # Restore original configuration
+    restore_config
+
+    log_info "================================================"
+    log_info "Test completed!"
+    log_info "Results saved to: $CSV_FILE"
+    log_info "================================================"
+
+    # Display CSV content
+    log_info "CSV file content:"
+    cat "$CSV_FILE"
+}
+
+# Run main function
+main "$@"
diff --git a/utils/test_perplexity.py b/utils/test_perplexity.py
new file mode 100644
index 0000000..f2d9788
--- /dev/null
+++ b/utils/test_perplexity.py
@@ -0,0 +1,608 @@
+#!/usr/bin/env python3
+"""
+Perplexity Test Script
+Tests GGUF model perplexity on multiple datasets using llama-perplexity.
+"""
+
+import os
+import subprocess
+import time
+import csv
+import re
+from datetime import datetime
+from pathlib import Path
+import argparse
+import tempfile
+import shutil
+
+
+class PerplexityTester:
+    def __init__(self, model_path, llama_perplexity_bin="../build/bin/llama-perplexity",
+                 data_dir="../data", output_dir="perplexity_results", quick_mode=False,
+                 quantize_bin="../build/bin/llama-quantize", test_embeddings=False, csv_output=None):
+        self.model_path = Path(model_path)
+        self.llama_perplexity_bin = Path(llama_perplexity_bin)
+        self.quantize_bin = Path(quantize_bin)
+        self.data_dir = Path(data_dir)
+        self.output_dir = Path(output_dir)
+        self.quick_mode = quick_mode
+        self.test_embeddings = test_embeddings
+        self.csv_output = Path(csv_output) if csv_output else None
+        self.results = []
+        self.created_models = set()  # Track newly created model files
+        self.temp_files = []  # Track temporary files for cleanup
+
+        # Embedding types to test
+        self.embedding_types = [
+            ('F32', 'f32'),
+            ('F16', 'f16'),
+            ('Q8_0', 'q8_0'),
+            ('Q6_K', 'q6_k'),
+            ('Q5_0', 'q5_0'),
+            ('Q4_0', 'q4_0'),
+            ('Q3_K', 'q3_k'),
+            ('TQ2_0', 'tq2_0'),
+        ]
+
+        # Create output directory
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Verify llama-perplexity binary exists
+        if not self.llama_perplexity_bin.exists():
+            raise FileNotFoundError(f"llama-perplexity binary not found: {self.llama_perplexity_bin}")
+
+        # Verify quantize binary exists if testing embeddings
+        if self.test_embeddings and not self.quantize_bin.exists():
+            raise FileNotFoundError(f"llama-quantize binary not found: {self.quantize_bin}")
+
+        # Verify model file exists
+        if not self.model_path.exists():
+            raise FileNotFoundError(f"Model file not found: {self.model_path}")
+
+    def find_datasets(self):
+        """Find all test.txt files in dataset directories."""
+        datasets = []
+
+        if not self.data_dir.exists():
+            print(f"❌ Data directory not found: {self.data_dir}")
+            return datasets
+
+        print(f"\n🔍 Searching for datasets in {self.data_dir}...")
+
+        # Look for test.txt files in subdirectories
+        for dataset_dir in sorted(self.data_dir.iterdir()):
+            if dataset_dir.is_dir():
+                test_file = dataset_dir / "test.txt"
+                if test_file.exists():
+                    size_mb = test_file.stat().st_size / (1024 * 1024)
+                    datasets.append({
+                        'name': dataset_dir.name,
+                        'path': test_file,
+                        'size': test_file.stat().st_size,
+                        'size_mb': size_mb
+                    })
+                    print(f"  ✅ {dataset_dir.name:<20} ({size_mb:.2f} MB)")
+                else:
+                    print(f"  ⚠️ {dataset_dir.name:<20} (no test.txt found)")
+
+        return datasets
+
+    def create_quick_dataset(self, dataset_path, num_chars=4096):
+        """Create a temporary dataset with only the first N characters for quick testing."""
+        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8')
+        self.temp_files.append(temp_file.name)
+
+        try:
+            with open(dataset_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read(num_chars)
+            temp_file.write(content)
+            temp_file.close()
+            return Path(temp_file.name)
+        except Exception as e:
+            print(f"⚠️ Failed to create quick dataset: {e}")
+            temp_file.close()
+            return dataset_path
+
+    def cleanup_temp_files(self):
+        """Clean up temporary files."""
+        for temp_file in self.temp_files:
+            try:
+                os.unlink(temp_file)
+            except OSError:
+                # Ignore files that were already removed
+                pass
+        self.temp_files = []
+
+    def run_perplexity_test(self, dataset_name, dataset_path, threads=16, ctx_size=512, model_override=None):
+        """Run perplexity test on a single dataset."""
+        test_model = model_override if model_override else self.model_path
+
+        print(f"\n{'='*80}")
+        print(f"📊 Testing on dataset: {dataset_name}")
+        print(f"   File: {dataset_path}")
+        print(f"   Model: {test_model.name}")
+        print(f"{'='*80}")
+
+        cmd = [
+            str(self.llama_perplexity_bin),
+            "-m", str(test_model),
+            "-f", str(dataset_path),
+            "-t", str(threads),
+            "-c", str(ctx_size),
+            "-ngl", "0"  # CPU only
+        ]
+
+        print(f"💻 Command: {' '.join(cmd)}")
+        print(f"⏱️ Starting test...\n")
+
+        start_time = time.time()
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=3600,  # 1 hour timeout
+                cwd=os.getcwd()
+            )
+
+            elapsed_time = time.time() - start_time
+
+            if result.returncode == 0:
+                # Parse perplexity from output (check both stdout and stderr)
+                combined_output = result.stdout + "\n" + result.stderr
+                ppl = self.parse_perplexity(combined_output)
+
+                if ppl is not None:
+                    print(f"\n✅ Perplexity: {ppl}")
+                    print(f"⏱️ Time: {elapsed_time:.2f}s ({elapsed_time/60:.2f} min)")
+                    status = "success"
+                else:
+                    print(f"\n⚠️ Test completed but could not parse perplexity")
+                    print(f"Last 500 chars of stdout:")
+                    print(result.stdout[-500:])
+                    print(f"Last 500 chars of stderr:")
+                    print(result.stderr[-500:])
+                    status = "parse_error"
+                    ppl = None
+            else:
+                print(f"\n❌ Test failed with return code {result.returncode}")
+                print(f"Error: {result.stderr[:500]}")
+                status = "failed"
+                ppl = None
+                elapsed_time = time.time() - start_time
+
+            return {
+                'dataset': dataset_name,
+                'perplexity': ppl,
+                'time': elapsed_time,
+                'status': status,
+                'stdout': result.stdout,
+                'stderr': result.stderr
+            }
+
+        except subprocess.TimeoutExpired:
+            elapsed_time = time.time() - start_time
+            print(f"\n❌ Timeout after {elapsed_time:.2f}s")
+            return {
+                'dataset': dataset_name,
+                'perplexity': None,
+                'time': elapsed_time,
+                'status': 'timeout',
+                'stdout': '',
+                'stderr': 'Test exceeded 1 hour timeout'
+            }
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            print(f"\n❌ Error: {e}")
+            return {
+                'dataset': dataset_name,
+                'perplexity': None,
+                'time': elapsed_time,
+                'status': 'error',
+                'stdout': '',
+                'stderr': str(e)
+            }
+
+    def parse_perplexity(self, output):
+        """Parse perplexity value (mean±std format) from llama-perplexity output."""
+        # First try to match "PPL = mean +/- std" format
+        pattern_with_std = r'PPL\s*=\s*(\d+\.?\d*)\s*\+/-\s*(\d+\.?\d*)'
+        match = re.search(pattern_with_std, output, re.IGNORECASE | re.MULTILINE)
+        if match:
+            try:
+                mean = float(match.group(1))
+                std = float(match.group(2))
+                return f"{mean:.4f}±{std:.4f}"
+            except ValueError:
+                pass
+
+        # Fall back to patterns without std
+        patterns = [
+            r'Final estimate:\s*PPL\s*=\s*(\d+\.?\d*)',
+            r'Final perplexity:\s*(\d+\.?\d*)',
+            r'PPL\s*=\s*(\d+\.?\d*)',
+            r'PPL:\s*(\d+\.?\d*)',
+            r'perplexity:\s*(\d+\.?\d*)',
+            r'ppl\s*=\s*(\d+\.?\d*)',
+            r'Perplexity:\s*(\d+\.?\d*)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, output, re.IGNORECASE | re.MULTILINE)
+            if match:
+                try:
+                    return f"{float(match.group(1)):.4f}"
+                except ValueError:
+                    continue
+
+        return None
+
+    def quantize_embedding(self, embedding_type, output_suffix):
+        """
+        Quantize model with specific embedding type.
+
+        Args:
+            embedding_type: Token embedding type (uppercase, e.g., 'Q6_K')
+            output_suffix: Output file suffix (lowercase, e.g., 'q6_k')
+
+        Returns:
+            Path to quantized model or None if failed
+        """
+        # Construct output path
+        model_dir = self.model_path.parent
+        output_path = model_dir / f"ggml-model-i2_s-embed-{output_suffix}.gguf"
+
+        # Skip quantization if the file already exists
+        if output_path.exists():
+            print(f"ℹ️ Model already exists: {output_path.name}")
+            return output_path
+
+        cmd = [
+            str(self.quantize_bin),
+            "--token-embedding-type", embedding_type,
+            str(self.model_path),
+            str(output_path),
+            "I2_S",
+            "1",
+            "1"
+        ]
+
+        print(f"\n{'='*80}")
+        print(f"🔄 Quantizing with embedding type: {embedding_type}")
+        print(f"📥 Input: {self.model_path.name}")
+        print(f"📤 Output: {output_path.name}")
+        print(f"💻 Command: {' '.join(cmd)}")
+        print(f"{'='*80}\n")
+
+        start_time = time.time()
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                cwd=os.getcwd(),
+                timeout=600  # 10 minutes timeout
+            )
+
+            duration = time.time() - start_time
+
+            if result.returncode == 0:
+                file_size_mb = output_path.stat().st_size / (1024 * 1024)
+                print(f"✅ Quantization successful!")
+                print(f"   Duration: {duration:.2f}s")
+                print(f"   Size: {file_size_mb:.2f} MB")
+
+                # Mark as newly created
+                self.created_models.add(output_path)
+                return output_path
+            else:
+                print(f"❌ Quantization failed with return code {result.returncode}")
+                print(f"Error: {result.stderr[:500]}")
+                return None
+
+        except subprocess.TimeoutExpired:
+            print(f"❌ Quantization timeout (exceeded 10 minutes)")
+            return None
+        except Exception as e:
+            print(f"❌ Quantization error: {e}")
+            return None
+
+    def cleanup_model(self, model_path):
+        """Delete model file if it was created during this session."""
+        if model_path in self.created_models:
+            try:
+                model_path.unlink()
+                print(f"🗑️ Deleted: {model_path.name}")
+                self.created_models.remove(model_path)
+            except Exception as e:
+                print(f"⚠️ Failed to delete {model_path.name}: {e}")
+        else:
+            print(f"ℹ️ Keeping existing file: {model_path.name}")
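+
+    # For reference, the command built by quantize_embedding for Q6_K
+    # embeddings is equivalent to running (paths are illustrative):
+    #   ./build/bin/llama-quantize --token-embedding-type Q6_K \
+    #       models/BitNet-b1.58-2B-4T/ggml-model.gguf \
+    #       models/BitNet-b1.58-2B-4T/ggml-model-i2_s-embed-q6_k.gguf I2_S 1 1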
+
+    def run_all_tests(self, threads=16, ctx_size=512):
+        """Run perplexity tests on all datasets."""
+        datasets = self.find_datasets()
+
+        if not datasets:
+            print(f"\n❌ No datasets found in {self.data_dir}")
+            print(f"   Make sure each dataset directory has a test.txt file")
+            return
+
+        # Quick mode: test all datasets but only the first 4096 chars with a smaller context
+        if self.quick_mode:
+            ctx_size = min(ctx_size, 128)  # Use smaller context in quick mode
+            print(f"\n⚡ QUICK TEST MODE ENABLED")
+            print(f"   - Testing all datasets with first 4096 characters only")
+            print(f"   - Using reduced context size: {ctx_size}")
+
+        # Determine models to test
+        if self.test_embeddings:
+            print(f"\n{'='*80}")
+            print(f"🧪 EMBEDDING QUANTIZATION TEST MODE")
+            print(f"{'='*80}")
+            print(f"📦 Base model: {self.model_path.name}")
+            print(f"🔢 Embedding types to test: {len(self.embedding_types)}")
+            print(f"📊 Datasets: {len(datasets)}")
+            print(f"🧵 Threads: {threads}")
+            print(f"📏 Context size: {ctx_size}")
+            print(f"{'='*80}")
+
+            total_start = time.time()
+
+            # Test each embedding type
+            for i, (embedding_type, output_suffix) in enumerate(self.embedding_types, 1):
+                print(f"\n\n{'#'*80}")
+                print(f"[{i}/{len(self.embedding_types)}] Testing embedding type: {output_suffix} ({embedding_type})")
+                print(f"{'#'*80}")
+
+                # Quantize model
+                quantized_model = self.quantize_embedding(embedding_type, output_suffix)
+
+                if quantized_model is None:
+                    print(f"⚠️ Skipping tests for {output_suffix} due to quantization failure")
+                    continue
+
+                # Test on all datasets
+                for j, dataset in enumerate(datasets, 1):
+                    print(f"\n[{j}/{len(datasets)}] Testing {dataset['name']} with {output_suffix}...")
+
+                    # Use quick dataset if in quick mode
+                    test_path = dataset['path']
+                    if self.quick_mode:
+                        test_path = self.create_quick_dataset(dataset['path'])
+
+                    result = self.run_perplexity_test(
+                        f"{dataset['name']}_embed-{output_suffix}",
+                        test_path,
+                        threads,
+                        ctx_size,
+                        model_override=quantized_model
+                    )
+                    self.results.append(result)
+
+                # Cleanup model after testing
+                print(f"\n🧹 Cleaning up {output_suffix} model...")
+                self.cleanup_model(quantized_model)
+
+                print(f"\n{'#'*80}")
+                print(f"✅ Completed {output_suffix}")
+                print(f"{'#'*80}")
+
+            total_time = time.time() - total_start
+
+        else:
+            # Regular single model test
+            print(f"\n{'='*80}")
+            print(f"🚀 PERPLEXITY TEST SESSION{' (QUICK MODE)' if self.quick_mode else ''}")
+            print(f"{'='*80}")
+            print(f"📦 Model: {self.model_path.name}")
+            print(f"📁 Model path: {self.model_path}")
+            print(f"📊 Datasets {'to test' if self.quick_mode else 'found'}: {len(datasets)}")
+            print(f"🧵 Threads: {threads}")
+            print(f"📏 Context size: {ctx_size}")
+            print(f"{'='*80}")
+
+            total_start = time.time()
+
+            # Run tests
+            for i, dataset in enumerate(datasets, 1):
+                print(f"\n\n[{i}/{len(datasets)}] Processing {dataset['name']}...")
+
+                # Use quick dataset if in quick mode
+                test_path = dataset['path']
+                if self.quick_mode:
+                    test_path = self.create_quick_dataset(dataset['path'])
+
+                result = self.run_perplexity_test(
+                    dataset['name'],
+                    test_path,
+                    threads,
+                    ctx_size
+                )
+                self.results.append(result)
+
+            total_time = time.time() - total_start
+
+        # Clean up temporary files
+        if self.quick_mode:
+            print(f"\n🧹 Cleaning up temporary files...")
+            self.cleanup_temp_files()
+
+        # Save results
+        self.save_results()
+
+        # Print summary
+        self.print_summary(total_time)
+
+    def save_results(self):
+        """Save results to CSV file."""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        model_name = self.model_path.stem
+
+        # Use custom CSV path if provided
+        if self.csv_output:
+            csv_file = self.csv_output
+            # Create parent directory if needed
+            csv_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            csv_file = self.output_dir / f"ppl_{model_name}_{timestamp}.csv"
+
+        print(f"\n💾 Saving results...")
+
+        with open(csv_file, 'w', newline='') as f:
+            writer = csv.DictWriter(f, fieldnames=['dataset', 'perplexity', 'time_seconds', 'status'])
+            writer.writeheader()
+            for result in self.results:
+                writer.writerow({
+                    'dataset': result['dataset'],
+                    'perplexity': result['perplexity'] if result['perplexity'] is not None else 'N/A',
+                    'time_seconds': f"{result['time']:.2f}",
+                    'status': result['status']
+                })
+
+        print(f"   ✅ CSV saved: {csv_file}")
+
+        # Save detailed log
+        log_file = self.output_dir / f"ppl_{model_name}_{timestamp}.log"
+        with open(log_file, 'w') as f:
+            f.write(f"Perplexity Test Results\n")
+            f.write(f"{'='*80}\n")
+            f.write(f"Model: {self.model_path}\n")
+            f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"{'='*80}\n\n")
+
+            for result in self.results:
+                f.write(f"\n{'='*80}\n")
+                f.write(f"Dataset: {result['dataset']}\n")
+                f.write(f"Perplexity: {result['perplexity']}\n")
+                f.write(f"Time: {result['time']:.2f}s\n")
+                f.write(f"Status: {result['status']}\n")
+                f.write(f"\nOutput:\n{result['stdout']}\n")
+                if result['stderr']:
+                    f.write(f"\nErrors:\n{result['stderr']}\n")
+
+        print(f"   ✅ Log saved: {log_file}")
+
+    def print_summary(self, total_time):
+        """Print summary of all tests."""
+        print(f"\n\n{'='*80}")
+        print(f"📊 TEST SUMMARY")
+        print(f"{'='*80}\n")
+
+        # Sort results by perplexity (lower is better)
+        successful = [r for r in self.results if r['perplexity'] is not None]
+        failed = [r for r in self.results if r['perplexity'] is None]
+
+        if successful:
+            # Extract numeric value from "mean±std" format for sorting
+            def get_ppl_value(result):
+                ppl = result['perplexity']
+                if isinstance(ppl, str) and '±' in ppl:
+                    return float(ppl.split('±')[0])
+                elif isinstance(ppl, str):
+                    try:
+                        return float(ppl)
+                    except ValueError:
+                        return float('inf')
+                return ppl
+
+            successful_sorted = sorted(successful, key=get_ppl_value)
+
+            print(f"{'Dataset':<20} {'Perplexity':>20} {'Time (s)':>12} {'Status':<15}")
+            print(f"{'-'*80}")
+
+            for result in successful_sorted:
+                ppl_str = str(result['perplexity']) if result['perplexity'] is not None else 'N/A'
+                print(f"{result['dataset']:<20} {ppl_str:>20} "
+                      f"{result['time']:>12.2f} {result['status']:<15}")
+
+            best_ppl = str(successful_sorted[0]['perplexity'])
+            print(f"\n🏆 Best result: {successful_sorted[0]['dataset']} "
+                  f"(PPL: {best_ppl})")
+
+        if failed:
+            print(f"\n❌ Failed tests ({len(failed)}):")
+            for result in failed:
+                print(f"   - {result['dataset']}: {result['status']}")
+
+        print(f"\n{'='*80}")
+        print(f"✅ Completed: {len(successful)}/{len(self.results)}")
+        print(f"⏱️ Total time: {total_time:.2f}s ({total_time/60:.2f} min)")
+        print(f"📁 Results saved in: {self.output_dir}")
+        print(f"{'='*80}\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Test model perplexity on multiple datasets')
+    parser.add_argument('--model', '-m',
+                        required=True,
+                        help='Path to GGUF model file')
+    parser.add_argument('--data-dir', '-d',
+                        default='data',
+                        help='Directory containing dataset folders (default: data)')
+    parser.add_argument('--threads', '-t',
+                        type=int,
+                        default=16,
+                        help='Number of threads (default: 16)')
+    parser.add_argument('--ctx-size', '-c',
+                        type=int,
+                        default=512,
+                        help='Context size (default: 512)')
+    parser.add_argument('--output-dir', '-o',
+                        default='perplexity_results',
+                        help='Output directory for results (default: perplexity_results)')
+    parser.add_argument('--llama-perplexity',
+                        default='./build/bin/llama-perplexity',
+                        help='Path to llama-perplexity binary (default: ./build/bin/llama-perplexity)')
+    parser.add_argument('--quick', '-q',
+                        action='store_true',
+                        help='Quick test mode: test all datasets with first 4096 characters and reduced context size (128)')
+    parser.add_argument('--test-embeddings', '-e',
+                        action='store_true',
+                        help='Test different embedding quantization types (f32, f16, q8_0, q6_k, q5_0, q4_0, q3_k, tq2_0)')
+    parser.add_argument('--csv-output',
+                        help='Custom path for CSV output file (e.g., results/my_ppl_results.csv)')
+    parser.add_argument('--quantize-bin',
+                        default='./build/bin/llama-quantize',
+                        help='Path to llama-quantize binary (default: ./build/bin/llama-quantize)')
+
+    args = parser.parse_args()
+
+    try:
+        tester = PerplexityTester(
+            model_path=args.model,
+            llama_perplexity_bin=args.llama_perplexity,
+            data_dir=args.data_dir,
+            output_dir=args.output_dir,
+            quick_mode=args.quick,
+            quantize_bin=args.quantize_bin,
+            test_embeddings=args.test_embeddings,
+            csv_output=args.csv_output
+        )
+
+        tester.run_all_tests(
+            threads=args.threads,
+            ctx_size=args.ctx_size
+        )
+
+    except FileNotFoundError as e:
+        print(f"❌ Error: {e}")
+        return 1
+    except KeyboardInterrupt:
+        print("\n\n⚠️ Test interrupted by user")
+        return 1
+    except Exception as e:
+        print(f"\n❌ Unexpected error: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
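+
+# Example invocations, run from the repository root (model paths are
+# illustrative):
+#   python3 utils/test_perplexity.py -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf --quick
+#   python3 utils/test_perplexity.py -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf \
+#       --test-embeddings --csv-output results/embed_ppl.csv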
diff --git a/utils/test_typical_shapes.sh b/utils/test_typical_shapes.sh
new file mode 100755
index 0000000..6ad805c
--- /dev/null
+++ b/utils/test_typical_shapes.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# Test typical matrix shapes for BitNet-2B model
+# Based on BitNet-b1.58-2B-4T architecture
+
+echo "=========================================="
+echo "BitNet-2B Typical Shapes Performance Test"
+echo "=========================================="
+echo ""
+
+ITERATIONS=1000
+BENCHMARK="../build/test_gemm_kernel"
+
+# Create stats directory if not exists
+mkdir -p ../stats
+
+# Generate output CSV filename
+CSV_FILE="../stats/gemm_kernel_test_noparal.csv"
+
+# Write CSV header
+echo "test_name,n,nr,nc,time_ms,gflops,throughput_tokens_per_sec" > "$CSV_FILE"
+echo "Results will be saved to: $CSV_FILE"
+echo ""
+
+# Function to extract metrics and append to CSV
+extract_and_save() {
+    local test_name="$1"
+    local output="$2"
+
+    # Extract values using grep and awk
+    local n=$(echo "$output" | grep "Embedding dimension" | awk '{print $5}')
+    local nr=$(echo "$output" | grep "Matrix Y rows" | awk '{print $6}')
+    local nc=$(echo "$output" | grep "Matrix X columns" | awk '{print $6}')
+    local avg_time=$(echo "$output" | grep "Average time" | awk '{print $4}')
+    local min_time=$(echo "$output" | grep "Min time" | awk '{print $4}')
+    local max_time=$(echo "$output" | grep "Max time" | awk '{print $4}')
+    local gflops=$(echo "$output" | grep "GFLOPS" | awk '{print $3}')
+    local throughput=$(echo "$output" | grep "Throughput" | awk '{print $3}')
+
+    # Estimate the standard deviation from the range: for a roughly normal
+    # distribution, ~95% of samples fall within min-max, so range ≈ 4*std
+    # and std ≈ range/4.
+    local std_time=$(echo "scale=4; ($max_time - $min_time) / 4" | bc)
+
+    # Format as mean±std
+    local time_formatted="${avg_time}±${std_time}"
+
+    # GFLOPS and throughput are reported as single values with no spread
+    # information, so they are stored as-is.
+
+    # Append to CSV
+    echo "${test_name},${n},${nr},${nc},${time_formatted},${gflops},${throughput}" >> "$CSV_FILE"
+}
+
+echo "Test 1: Single Token Generation (Attention QKV projection)"
+echo "  Scenario: Generating 1 token at a time"
+echo "  Shape: n=2048, r=1, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 1 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "single_token_gen" "$OUTPUT"
+echo ""
+
+echo "Test 2: Small Batch Prompt Processing (Attention QKV projection)"
+echo "  Scenario: Processing prompt with 128 tokens, batch size 1"
+echo "  Shape: n=2048, r=128, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "small_batch_prompt" "$OUTPUT"
+echo ""
+
+echo "Test 3: Medium Batch Prompt Processing (Attention QKV projection)"
+echo "  Scenario: Processing prompt with 256 tokens or batch of 256"
+echo "  Shape: n=2048, r=256, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 256 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "medium_batch_prompt" "$OUTPUT"
+echo ""
+
+echo "Test 4: Large Batch Processing (Attention QKV projection)"
+echo "  Scenario: Processing 512 tokens or batch of 512"
+echo "  Shape: n=2048, r=512, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 512 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "large_batch_prompt" "$OUTPUT"
+echo ""
+
+echo "Test 5: FFN Up-projection (Small batch)"
+echo "  Scenario: Feed-forward network expansion, 128 tokens"
+echo "  Shape: n=2048, r=128, c=8192"
+OUTPUT=$($BENCHMARK -n 2048 -r 128 -c 8192 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "ffn_up_projection" "$OUTPUT"
+echo ""
+
+echo "Test 6: FFN Down-projection (Small batch)"
+echo "  Scenario: Feed-forward network reduction, 128 tokens"
+echo "  Shape: n=8192, r=128, c=2048"
+OUTPUT=$($BENCHMARK -n 8192 -r 128 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "ffn_down_projection" "$OUTPUT"
+echo ""
+
+echo "Test 7: Long Context Processing"
+echo "  Scenario: Processing very long context (2048 tokens)"
+echo "  Shape: n=2048, r=2048, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 2048 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "long_context" "$OUTPUT"
+echo ""
+
+echo "Test 8: Batched Token Generation"
+echo "  Scenario: Generating tokens for 32 sequences simultaneously"
+echo "  Shape: n=2048, r=32, c=2048"
+OUTPUT=$($BENCHMARK -n 2048 -r 32 -c 2048 -i $ITERATIONS 2>&1)
+echo "$OUTPUT"
+extract_and_save "batched_token_gen" "$OUTPUT"
+echo ""
+
+echo "=========================================="
+echo "All tests completed!"
+echo "Results saved to: $CSV_FILE"
+echo "=========================================="
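+
+# Example of a resulting CSV row (numbers are illustrative, not measured):
+#   single_token_gen,2048,1,2048,0.800±0.0125,10.49,2560000.00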
diff --git a/utils/tune_gemm_config.py b/utils/tune_gemm_config.py
new file mode 100644
index 0000000..83b4218
--- /dev/null
+++ b/utils/tune_gemm_config.py
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+"""
+GEMM Configuration Tuning Script
+This script automatically tunes ROW_BLOCK_SIZE, COL_BLOCK_SIZE, and PARALLEL_SIZE
+to find the optimal configuration for maximum throughput (t/s).
+"""
+
+import subprocess
+import os
+import re
+import csv
+import shutil
+from datetime import datetime
+from pathlib import Path
+import argparse
+
+
+class GemmTuner:
+    def __init__(self, config_path, model_path, threads=16):
+        self.config_path = Path(config_path)
+        self.model_path = model_path
+        self.threads = threads
+        self.backup_path = self.config_path.parent / f"gemm-config.h.backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        self.build_dir = Path("../build")
+        self.results = []
+
+    def backup_config(self):
+        """Backup current configuration file"""
+        print(f"📦 Backing up current config to {self.backup_path}")
+        shutil.copy2(self.config_path, self.backup_path)
+
+    def restore_config(self):
+        """Restore original configuration file"""
+        print(f"♻️ Restoring original config from {self.backup_path}")
+        shutil.copy2(self.backup_path, self.config_path)
+
+    def generate_config(self, act_parallel, row_block_size, col_block_size, parallel_size):
+        """Generate new configuration file"""
+        # The same tile sizes are written into both branches of the
+        # ACT_PARALLEL conditional; the branch structure is kept only so the
+        # generated file mirrors the layout of the original header.
+        block = (f" #define ROW_BLOCK_SIZE {row_block_size}\n"
+                 f" #define COL_BLOCK_SIZE {col_block_size}\n"
+                 f" #define PARALLEL_SIZE {parallel_size}\n")
+        branch = ("#if defined(ACT_PARALLEL)\n" + block
+                  + "#else\n" + block + "#endif\n")
+        avx_guard = "#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)\n"
+
+        content = ""
+
+        # ACT_PARALLEL definition
+        if act_parallel:
+            content += "#define ACT_PARALLEL\n"
+        else:
+            content += "// #define ACT_PARALLEL\n"
+
+        # Detect architecture branches in the original config file
+        with open(self.backup_path, 'r') as f:
+            original = f.read()
+
+        has_avx = "__AVX__" in original or "__AVX2__" in original
+        has_arm = "__ARM_NEON" in original
+
+        # If architecture detection exists, generate the corresponding branches
+        if has_avx and has_arm:
+            # Multi-architecture configuration
+            content += avx_guard + branch
+            content += "#elif defined(__ARM_NEON)\n" + branch
+            content += "#endif\n"
+        elif has_avx:
+            # AVX architecture only
+            content += avx_guard + branch + "#endif\n"
+        elif has_arm:
+            # ARM architecture only
+            content += "#if defined(__ARM_NEON)\n" + branch + "#endif\n"
+        else:
+            # No architecture detection, define directly
+            content += branch
+
+        content += "\n"
+
+        with open(self.config_path, 'w') as f:
+            f.write(content)
+
+    def rebuild_project(self):
+        """Rebuild project"""
+        print("🔨 Rebuilding project...")
+        result = subprocess.run(
+            ["cmake", "--build", str(self.build_dir), "--target", "llama-bench"],
+            capture_output=True,
+            text=True,
+            cwd=os.getcwd()
+        )
+        if result.returncode != 0:
+            print(f"⚠️ Build warning/error: {result.stderr}")
+            return False
+        return True
+
+    def run_benchmark(self):
+        """Run benchmark test"""
+        cmd = [
+            f"{self.build_dir}/bin/llama-bench",
+            "-m", self.model_path,
+            "-p", "128",
+            "-n", "0",
+            "-t", str(self.threads),
+            "-ngl", "0"
+        ]
+
+        print(f"⚡ Running benchmark: {' '.join(cmd)}")
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            cwd=os.getcwd(),
+            timeout=300  # 5 minute timeout
+        )
+
+        if result.returncode != 0:
+            print(f"❌ Benchmark failed: {result.stderr}")
+            return None
+
+        return result.stdout
+
+    def parse_throughput(self, output):
+        """Parse pp128 throughput from output"""
+        # Match a pp128 table row such as: | pp128 | 501.06 ± 11.37 |
+        pp_pattern = r'\|\s+pp128\s+\|\s+([\d.]+)\s+±\s+([\d.]+)\s+\|'
+        pp_match = re.search(pp_pattern, output)
+
+        if pp_match:
+            pp_throughput = float(pp_match.group(1))
+            pp_std_dev = float(pp_match.group(2))
+
+            return {
+                'pp_throughput': pp_throughput,
+                'pp_std_dev': pp_std_dev
+            }
+
+        return None
+
+    def test_configuration(self, act_parallel, row_block_size, col_block_size, parallel_size):
+        """Test a single configuration"""
+        config_name = f"ACT_{'ON' if act_parallel else 'OFF'}_R{row_block_size}_C{col_block_size}_P{parallel_size}"
+        print(f"\n{'='*80}")
+        print(f"🧪 Testing configuration: {config_name}")
+        print(f"   ACT_PARALLEL: {act_parallel}")
+        print(f"   ROW_BLOCK_SIZE: {row_block_size}")
+        print(f"   COL_BLOCK_SIZE: {col_block_size}")
+        print(f"   PARALLEL_SIZE: {parallel_size}")
+        print(f"{'='*80}")
+
+        # Generate configuration
+        self.generate_config(act_parallel, row_block_size, col_block_size, parallel_size)
+
+        # Rebuild project
+        if not self.rebuild_project():
+            print("⚠️ Build failed, skipping this configuration")
+            return None
+
+        # Run benchmark test
+        output = self.run_benchmark()
+        if output is None:
+            return None
+
+        # Parse results
+        metrics = self.parse_throughput(output)
+
+        if metrics is not None:
+            result = {
+                'act_parallel': act_parallel,
+                'row_block_size': row_block_size,
+                'col_block_size': col_block_size,
+                'parallel_size': parallel_size,
+                'config_name': config_name,
+                **metrics
+            }
+            self.results.append(result)
+            print(f"✅ PP128: {metrics['pp_throughput']:.2f} ± {metrics['pp_std_dev']:.2f} t/s")
+            return result
+        else:
+            print("❌ Failed to parse throughput")
+            return None
+
+    def save_results(self, csv_path):
+        """Save results to CSV file"""
+        print(f"\n💾 Saving results to {csv_path}")
+
+        with open(csv_path, 'w', newline='') as f:
+            writer = csv.DictWriter(f, fieldnames=[
+                'config_name', 'act_parallel', 'row_block_size',
+                'col_block_size', 'parallel_size',
+                'pp_throughput', 'pp_std_dev'
+            ])
+            writer.writeheader()
+            writer.writerows(self.results)
+
+    def find_best_config(self):
+        """Find the configuration with the highest throughput"""
+        if not self.results:
+            print("❌ No valid results found")
+            return None
+
+        best = max(self.results, key=lambda x: x['pp_throughput'])
+        return best
+
+    def run_tuning(self, configurations, output_csv=None):
+        """Run the test for all configurations"""
+        print(f"\n🚀 Starting tuning process with {len(configurations)} configurations")
+        print(f"📊 Model: {self.model_path}")
+        print(f"🧵 Threads: {self.threads}\n")
+
+        # Backup configuration
+        self.backup_config()
+
+        try:
+            # Test all configurations
+            for i, config in enumerate(configurations, 1):
+                print(f"\n[{i}/{len(configurations)}]")
+                self.test_configuration(**config)
+
+            # Save results
+            if output_csv is None:
+                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+                csv_path = f"stats/tuning_results_{timestamp}.csv"
+            else:
+                csv_path = output_csv
+            self.save_results(csv_path)
+
+            # Find best configuration
+            best = self.find_best_config()
+            if best:
+                print(f"\n{'='*80}")
+                print(f"🏆 BEST CONFIGURATION FOUND!")
+                print(f"{'='*80}")
+                print(f"Configuration: {best['config_name']}")
+                print(f"ACT_PARALLEL: {best['act_parallel']}")
+                print(f"ROW_BLOCK_SIZE: {best['row_block_size']}")
+                print(f"COL_BLOCK_SIZE: {best['col_block_size']}")
+                print(f"PARALLEL_SIZE: {best['parallel_size']}")
+                print(f"PP128 Throughput: {best['pp_throughput']:.2f} ± {best['pp_std_dev']:.2f} t/s")
+                print(f"{'='*80}\n")
+
+                # Apply best configuration
+                apply = input("Do you want to apply this configuration? (y/n): ").strip().lower()
+                if apply == 'y':
+                    self.generate_config(
+                        best['act_parallel'],
+                        best['row_block_size'],
+                        best['col_block_size'],
+                        best['parallel_size']
+                    )
+                    self.rebuild_project()
+                    print("✅ Best configuration applied!")
+                else:
+                    self.restore_config()
+                    print("✅ Original configuration restored")
+
+        except KeyboardInterrupt:
+            print("\n⚠️ Tuning interrupted by user")
+            self.restore_config()
+        except Exception as e:
+            print(f"\n❌ Error during tuning: {e}")
+            self.restore_config()
+            raise
+
+
+def generate_configurations():
+    """Generate the list of configurations to test"""
+    configurations = []
+
+    act_parallel_options = [True]
+
+    row_sizes = [2, 4, 8, 16, 32]
+    col_sizes = [32, 64, 128, 256, 512, 1024]
+    parallelism_degree = [2, 4, 8]
+
+    for act_parallel in act_parallel_options:
+        for row in row_sizes:
+            for col in col_sizes:
+                for parallel in parallelism_degree:
+                    # Filter out invalid combinations
+                    if act_parallel:
+                        # With ACT_PARALLEL, only keep combinations where parallel <= row
+                        if parallel > row:
+                            continue
+                    else:
+                        # Without ACT_PARALLEL, only keep combinations where parallel <= col
+                        if parallel > col:
+                            continue
+
+                    configurations.append({
+                        'act_parallel': act_parallel,
+                        'row_block_size': row,
+                        'col_block_size': col,
+                        'parallel_size': parallel
+                    })
+
+    return configurations
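+
+# With the defaults above (act_parallel fixed to True, 5 row sizes, 6 column
+# sizes, and parallel in {2, 4, 8} constrained to parallel <= row), this
+# yields 6 * (1 + 2 + 3 + 3 + 3) = 72 configurations.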
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Tune GEMM configuration for optimal performance')
+    parser.add_argument('--config', default='../include/gemm-config.h',
+                        help='Path to gemm-config.h file')
+    parser.add_argument('--model', default='../models/BitNet-b1.58-2B-4T/ggml-model-i2_s-embed-q6_k.gguf',
+                        help='Path to model file')
+    parser.add_argument('--threads', type=int, default=8,
+                        help='Number of threads to use')
+    parser.add_argument('--quick', action='store_true',
+                        help='Quick test with fewer configurations')
+    parser.add_argument('--custom', action='store_true',
+                        help='Manually specify configurations to test')
+    parser.add_argument('--output', type=str, default=None,
+                        help='Output CSV file path (default: stats/tuning_results_<timestamp>.csv)')
+
+    args = parser.parse_args()
+
+    tuner = GemmTuner(args.config, args.model, args.threads)
+
+    if args.custom:
+        # Custom configuration mode
+        print("Custom configuration mode")
+        configurations = []
+        while True:
+            print("\nEnter configuration (or 'done' to finish):")
+            # Read the first field into a variable so typing 'done' can end the loop
+            answer = input("ACT_PARALLEL (y/n, or 'done'): ").strip().lower()
+            if answer == 'done':
+                break
+            act = answer == 'y'
+            row = int(input("ROW_BLOCK_SIZE: "))
+            col = int(input("COL_BLOCK_SIZE: "))
+            par = int(input("PARALLEL_SIZE: "))
+            configurations.append({
+                'act_parallel': act,
+                'row_block_size': row,
+                'col_block_size': col,
+                'parallel_size': par
+            })
+    elif args.quick:
+        # Quick test mode - test only a few key configurations
+        configurations = [
+            {'act_parallel': True, 'row_block_size': 4, 'col_block_size': 128, 'parallel_size': 4},
+            {'act_parallel': True, 'row_block_size': 8, 'col_block_size': 128, 'parallel_size': 4},
+            {'act_parallel': True, 'row_block_size': 4, 'col_block_size': 64, 'parallel_size': 4},
+            {'act_parallel': False, 'row_block_size': 32, 'col_block_size': 4, 'parallel_size': 4},
+            {'act_parallel': False, 'row_block_size': 16, 'col_block_size': 4, 'parallel_size': 4},
+        ]
+    else:
+        # Full test mode
+        configurations = generate_configurations()
+
+    print(f"\n{'='*80}")
+    print(f"GEMM Configuration Tuner")
+    print(f"{'='*80}")
+    print(f"Total configurations to test: {len(configurations)}")
+    print(f"Estimated time: ~{len(configurations) * 0.5:.1f} minutes (assuming 30s per test)")
+    print(f"{'='*80}\n")
+
+    proceed = input("Proceed with tuning? (y/n): ").strip().lower()
+    if proceed != 'y':
+        print("Tuning cancelled")
+        return
+
+    tuner.run_tuning(configurations, output_csv=args.output)
+
+
+if __name__ == "__main__":
+    main()
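+
+# Example invocation (run from the utils/ directory, since the default
+# --config and --model paths and the hardcoded build dir are relative to it):
+#   python3 tune_gemm_config.py --quick --threads 8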