diff --git a/README.md b/README.md index bfb09a6..2cc2a73 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.5 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** and **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details. -**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.5x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md). +**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.15x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md). performance_comparison diff --git a/assets/performance.png b/assets/performance.png index 03d477d..078fd3f 100644 Binary files a/assets/performance.png and b/assets/performance.png differ diff --git a/src/README.md b/src/README.md index 1a8ef2b..f713b9a 100644 --- a/src/README.md +++ b/src/README.md @@ -153,40 +153,40 @@ Comparison of optimized parallel kernels vs. 
original implementation: **Test Configuration:** - Model: BitNet-b1.58-2B-4T -- Hardware: AMD EPYC 7V13 64-Core Processor +- Hardware: AMD EPYC 7V13 - Threads: 1 / 2 / 4 / 8 / 12 / 16 - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) - Method: Activation Parallel
-x86_performance +performance_comparison_amd_epyc
**Test Configuration:** - Model: BitNet-b1.58-2B-4T -- Hardware: ARM Core -- Threads: 1 / 2 / 4 / 8 +- Hardware: Intel i7-13800H +- Threads: 1 / 2 / 4 / 6 - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) -- Method: Activation Parallel with DOTPROD +- Method: Activation Parallel
-arm_dotprod_performance +performance_comparison_i7-13800h
**Test Configuration:** - Model: BitNet-b1.58-2B-4T -- Hardware: ARM Core +- Hardware: Cobalt 100 - Threads: 1 / 2 / 4 / 8 - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) -- Method: Activation Parallel without DOTPROD +- Method: Activation Parallel
-arm_no_dotprod_performance +performance_comparison_cobalt100_dotprod
diff --git a/src/assets/performance_arm_dotprod.png b/src/assets/performance_arm_dotprod.png deleted file mode 100644 index 5163b3c..0000000 Binary files a/src/assets/performance_arm_dotprod.png and /dev/null differ diff --git a/src/assets/performance_arm_no_dotprod.png b/src/assets/performance_arm_no_dotprod.png deleted file mode 100644 index da980be..0000000 Binary files a/src/assets/performance_arm_no_dotprod.png and /dev/null differ diff --git a/src/assets/performance_comparison_amd_epyc.png b/src/assets/performance_comparison_amd_epyc.png new file mode 100644 index 0000000..6ebdb3d Binary files /dev/null and b/src/assets/performance_comparison_amd_epyc.png differ diff --git a/src/assets/performance_comparison_cobalt100_dotprod.png b/src/assets/performance_comparison_cobalt100_dotprod.png new file mode 100644 index 0000000..4d0ef8c Binary files /dev/null and b/src/assets/performance_comparison_cobalt100_dotprod.png differ diff --git a/src/assets/performance_comparison_i7-13800h.png b/src/assets/performance_comparison_i7-13800h.png new file mode 100644 index 0000000..e486d66 Binary files /dev/null and b/src/assets/performance_comparison_i7-13800h.png differ diff --git a/src/assets/performance_x86.png b/src/assets/performance_x86.png deleted file mode 100644 index 31a2332..0000000 Binary files a/src/assets/performance_x86.png and /dev/null differ diff --git a/utils/test_power.sh b/utils/test_power.sh new file mode 100755 index 0000000..79a1a68 --- /dev/null +++ b/utils/test_power.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# Monitor power consumption for llama-bench with different thread configurations +# Usage: ./test_power.sh <model_path> <output_csv> <pp_threads> <tg_threads> +# Example: ./test_power.sh models/model.gguf results.csv "1,2,4,8" "1,2,4,8" + +set -e + +# Parse arguments +if [ $# -ne 4 ]; then + echo "Usage: $0 <model_path> <output_csv> <pp_threads> <tg_threads>" + echo "Example: $0 models/model.gguf results.csv \"1,2,4,8\" \"1,2,4,8\"" + exit 1 +fi + +MODEL_PATH="$1" +OUTPUT_CSV="$2" +PP_THREADS="$3" +TG_THREADS="$4" + 
# ---------------------------------------------------------------------------
# Power/throughput sweep for llama-bench.
#
# Inputs (set by the argument-parsing preamble of this script):
#   MODEL_PATH  - model file passed to llama-bench via -m
#   OUTPUT_CSV  - CSV file that receives one row per (workload, threads) run
#   PP_THREADS  - comma-separated thread counts for the pp128 runs
#   TG_THREADS  - comma-separated thread counts for the tg128 runs
# Optional environment:
#   CPU_TDP_WATTS - wattage that 100% CPU utilisation maps to (default: 200)
#
# NOTE: the "power" column is an estimate computed from CPU utilisation
# (utilisation% * TDP / 100); no hardware energy counter is read.
# ---------------------------------------------------------------------------

#######################################
# Remove the private scratch directory. Deleting it also deletes PID_FILE,
# which is the signal a still-running monitor_cpu loop uses to stop.
# Globals: WORK_DIR (read)
#######################################
cleanup() {
  if [[ -n "${WORK_DIR:-}" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Sample CPU utilisation and average core frequency every 0.5 s, appending
# "timestamp,usage,freq" rows to a CSV log for as long as PID_FILE exists.
# Globals:   PID_FILE (read) - sentinel file; the loop ends once it is removed
# Arguments: $1 - log file to (re)create
#######################################
monitor_cpu() {
  local log_file="$1"
  local cpu_usage avg_freq timestamp

  echo "Timestamp,CPU_Usage(%),Avg_Freq(MHz)" > "$log_file"
  while [[ -f "$PID_FILE" ]]; do
    # Locate the idle figure by its "id" label instead of a fixed column:
    # top prints "ni,100.0 id" without a separating space when a value needs
    # five characters, which shifts whitespace-delimited field positions.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
      /Cpu\(s\)/ && !seen {
        seen = 1
        if (match($0, /[0-9.]+[ %]*id/)) {
          idle = substr($0, RSTART, RLENGTH)
          sub(/[ %]*id$/, "", idle)
          print 100 - idle
        }
      }')
    # /proc/cpuinfo has no "cpu MHz" lines on some platforms (e.g. ARM);
    # report 0 there instead of dividing by zero.
    avg_freq=$(awk '
      /cpu MHz/ { sum += $4; count++ }
      END { if (count > 0) printf "%.0f", sum / count; else printf "0" }
    ' /proc/cpuinfo)
    timestamp=$(date +%s.%N)
    echo "$timestamp,$cpu_usage,$avg_freq" >> "$log_file"
    sleep 0.5
  done
}

#######################################
# Estimate average power from a monitor_cpu log.
# Arguments: $1 - log file (header line + "timestamp,usage,freq" rows)
#            $2 - optional wattage corresponding to 100% utilisation
#                 (default: 200)
# Outputs:   mean(usage) * watts / 100 with two decimals, or "0" when the
#            log holds no usable sample. Rows whose usage field is empty
#            (a sample that could not be parsed) are ignored.
#######################################
calculate_power() {
  local log_file="$1"
  local tdp_watts="${2:-200}"

  awk -F',' -v tdp="$tdp_watts" '
    NR > 1 && $2 != "" { sum_cpu += $2; count++ }
    END {
      if (count > 0) {
        printf "%.2f", (sum_cpu / count) * tdp / 100
      } else {
        print "0"
      }
    }' "$log_file"
}

#######################################
# Pull the mean tokens/s for one workload out of captured llama-bench output.
# Arguments: $1 - file holding the llama-bench output
#            $2 - workload label to look for (e.g. pp128, tg128)
# Outputs:   the number that precedes the first "±" on the first line that
#            contains the label, formatted with two decimals; nothing when
#            no such line exists.
#######################################
extract_throughput() {
  local bench_output="$1"
  local workload="$2"

  awk -v w="$workload" '
    index($0, w) {
      # Results are printed as "mean ± std"; take the token before "±".
      for (i = 1; i < NF; i++) {
        if ($(i + 1) == "±") {
          printf "%.2f", $i
          exit
        }
      }
    }' "$bench_output"
}

#######################################
# Run llama-bench once while sampling the CPU, and emit one CSV row.
# Globals:   MODEL_PATH, PID_FILE, TEMP_LOG, BENCH_OUTPUT (read),
#            CPU_TDP_WATTS (read, optional)
# Arguments: $1 - "pp" for the prompt workload, anything else for generation
#            $2 - thread count handed to llama-bench via -t
# Outputs:   stdout: "workload,threads,throughput,power,joules_per_token"
#            stderr: progress and warnings (kept out of the captured row)
#######################################
run_benchmark() {
  local workload="$1"
  local threads="$2"
  local workload_name
  local -a bench_args

  if [[ "$workload" == "pp" ]]; then
    bench_args=(-p 128 -n 0)
    workload_name="pp128"
  else
    # -p 0 skips the prompt test so that only tg128 runs while the CPU is
    # being sampled; with -p 128 the pp128 phase would be averaged into the
    # tg128 power figure.
    bench_args=(-p 0 -n 128)
    workload_name="tg128"
  fi

  echo "Testing $workload_name with $threads threads..." >&2

  # Start monitoring
  touch "$PID_FILE"
  monitor_cpu "$TEMP_LOG" &
  local monitor_pid=$!

  # Capture the exit status instead of letting `set -e` abort here, so the
  # monitor below is always stopped.
  local bench_status=0
  ./build/bin/llama-bench -m "$MODEL_PATH" "${bench_args[@]}" \
    -t "$threads" -ngl 0 > "$BENCH_OUTPUT" 2>&1 || bench_status=$?

  # Stop monitoring
  rm -f "$PID_FILE"
  wait "$monitor_pid" 2>/dev/null || true

  if (( bench_status != 0 )); then
    echo "Warning: llama-bench exited with status $bench_status for $workload_name, threads=$threads" >&2
  fi

  local throughput power j_per_token
  throughput=$(extract_throughput "$BENCH_OUTPUT" "$workload_name")
  power=$(calculate_power "$TEMP_LOG" "${CPU_TDP_WATTS:-200}")

  if [[ -z "$throughput" || "$throughput" == "0" ]]; then
    echo "Warning: Failed to extract throughput for $workload_name, threads=$threads" >&2
    throughput="0"
  fi

  # Joules per token = watts / (tokens per second)
  j_per_token=$(awk -v p="$power" -v t="$throughput" 'BEGIN {
    if (t > 0) printf "%.4f", p/t; else print "0"
  }')

  echo "  Throughput: $throughput t/s, Power: $power W, Energy: $j_per_token J/t" >&2

  # Only the CSV row goes to stdout (this is what callers capture)
  echo "$workload_name,$threads,$throughput,$power,$j_per_token"
}

#######################################
# Benchmark one workload at every thread count in a comma-separated list and
# append each resulting row to OUTPUT_CSV.
# Globals:   OUTPUT_CSV (appended to)
# Arguments: $1 - workload selector forwarded to run_benchmark ("pp" / "tg")
#            $2 - comma-separated thread counts, e.g. "1,2,4,8"
#######################################
run_sweep() {
  local workload="$1"
  local thread_list="$2"
  local -a counts=()
  local threads result

  IFS=',' read -ra counts <<< "$thread_list"
  for threads in "${counts[@]}"; do
    read -r threads <<< "$threads"   # trim surrounding whitespace
    [[ -n "$threads" ]] || continue  # tolerate "1,,2" or a trailing comma
    result=$(run_benchmark "$workload" "$threads")
    echo "$result" >> "$OUTPUT_CSV"
  done
}

#######################################
# Validate inputs, set up scratch files, run both sweeps and print the CSV.
# Globals: MODEL_PATH, OUTPUT_CSV, PP_THREADS, TG_THREADS (read);
#          WORK_DIR, TEMP_LOG, PID_FILE, BENCH_OUTPUT (written)
#######################################
main() {
  # Validate model exists
  if [[ ! -f "$MODEL_PATH" ]]; then
    echo "Error: Model file not found: $MODEL_PATH" >&2
    exit 1
  fi

  # Create output directory if needed
  mkdir -p "$(dirname "$OUTPUT_CSV")"

  # Private scratch directory instead of predictable /tmp names; removed on
  # every exit path by the EXIT trap.
  WORK_DIR=$(mktemp -d "${TMPDIR:-/tmp}/power_monitor.XXXXXX") || {
    echo "Error: Failed to create temporary directory" >&2
    exit 1
  }
  trap cleanup EXIT
  TEMP_LOG="$WORK_DIR/power_monitor.log"
  PID_FILE="$WORK_DIR/monitor.pid"
  BENCH_OUTPUT="$WORK_DIR/bench_output.txt"

  # Initialize CSV
  echo "Workload,Threads,Throughput(t/s),Power(W),Energy(J/t)" > "$OUTPUT_CSV"

  run_sweep "pp" "$PP_THREADS"
  run_sweep "tg" "$TG_THREADS"

  echo ""
  echo "=== Benchmark Complete ==="
  echo "Results saved to: $OUTPUT_CSV"
  echo ""
  cat "$OUTPUT_CSV"
}

# MODEL_PATH is assigned by the argument-parsing preamble. When it has not
# been set at all, only define the helpers and do not start a benchmark.
if [[ -n "${MODEL_PATH+set}" ]]; then
  main
fi