[modify] some test picture and add power test script
@@ -10,7 +10,7 @@ bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.5
|
||||
|
||||
The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** and **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.
|
||||
|
||||
**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.5x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md).
|
||||
**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.15x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md).
|
||||
|
||||
<img src="./assets/performance.png" alt="performance_comparison" width="800"/>
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 646 KiB After Width: | Height: | Size: 1.1 MiB |
@@ -153,40 +153,40 @@ Comparison of optimized parallel kernels vs. original implementation:
|
||||
|
||||
**Test Configuration:**
|
||||
- Model: BitNet-b1.58-2B-4T
|
||||
- Hardware: AMD EPYC 7V13 64-Core Processor
|
||||
- Hardware: AMD EPYC 7V13
|
||||
- Threads: 1 / 2 / 4 / 8 / 12 / 16
|
||||
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
|
||||
- Method: Activation Parallel
|
||||
|
||||
<div align="center">
|
||||
|
||||
<img src="./assets/performance_x86.png" alt="x86_performance" width="800"/>
|
||||
<img src="./assets/performance_comparison_amd_epyc.png" alt="performance_comparison_amd_epyc" width="800"/>
|
||||
|
||||
</div>
|
||||
|
||||
**Test Configuration:**
|
||||
- Model: BitNet-b1.58-2B-4T
|
||||
- Hardware: ARM Core
|
||||
- Threads: 1 / 2 / 4 / 8
|
||||
- Hardware: Intel i7-13800H
|
||||
- Threads: 1 / 2 / 4 / 6
|
||||
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
|
||||
- Method: Activation Parallel with DOTPROD
|
||||
- Method: Activation Parallel
|
||||
|
||||
<div align="center">
|
||||
|
||||
<img src="./assets/performance_arm_dotprod.png" alt="arm_dotprod_performance" width="800"/>
|
||||
<img src="./assets/performance_comparison_i7-13800h.png" alt="performance_comparison_i7-13800h" width="800"/>
|
||||
|
||||
</div>
|
||||
|
||||
**Test Configuration:**
|
||||
- Model: BitNet-b1.58-2B-4T
|
||||
- Hardware: ARM Core
|
||||
- Hardware: Cobalt 100
|
||||
- Threads: 1 / 2 / 4 / 8
|
||||
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
|
||||
- Method: Activation Parallel without DOTPROD
|
||||
- Method: Activation Parallel
|
||||
|
||||
<div align="center">
|
||||
|
||||
<img src="./assets/performance_arm_no_dotprod.png" alt="arm_no_dotprod_performance" width="800"/>
|
||||
<img src="./assets/performance_comparison_cobalt100_dotprod.png" alt="performance_comparison_cobalt100_dotprod" width="800"/>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 280 KiB |
|
Before Width: | Height: | Size: 291 KiB |
|
After Width: | Height: | Size: 313 KiB |
|
After Width: | Height: | Size: 290 KiB |
|
After Width: | Height: | Size: 260 KiB |
|
Before Width: | Height: | Size: 315 KiB |
@@ -0,0 +1,151 @@
|
||||
#!/bin/bash
#
# Monitor power consumption for llama-bench with different thread configurations.
#
# Usage:   ./monitor_power.sh <model_path> <output_csv> <pp_threads> <tg_threads>
# Example: ./monitor_power.sh models/model.gguf results.csv "1,2,4,8" "1,2,4,8"
#
# <pp_threads> and <tg_threads> are comma-separated lists of thread counts.
# One CSV row is appended to <output_csv> for every listed thread count.

set -e

# Parse arguments
if [[ $# -ne 4 ]]; then
  echo "Usage: $0 <model_path> <output_csv> <pp_threads> <tg_threads>" >&2
  echo "Example: $0 models/model.gguf results.csv \"1,2,4,8\" \"1,2,4,8\"" >&2
  exit 1
fi

MODEL_PATH="$1"
OUTPUT_CSV="$2"
PP_THREADS="$3"
TG_THREADS="$4"

# Validate model exists
if [[ ! -f "$MODEL_PATH" ]]; then
  echo "Error: Model file not found: $MODEL_PATH" >&2
  exit 1
fi

# Scratch files live in a private mktemp directory instead of predictable
# /tmp/<name>_$$ paths, so another user cannot pre-create or symlink them.
WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/power_monitor.XXXXXX")"
TEMP_LOG="$WORK_DIR/power_monitor.log"
PID_FILE="$WORK_DIR/monitor.pid"
BENCH_OUTPUT="$WORK_DIR/bench_output.txt"

# The background sampler loops for as long as PID_FILE exists, so the file
# must disappear on every exit path (including Ctrl-C); otherwise the sampler
# would keep running after the script is gone.
cleanup() {
  rm -f -- "$PID_FILE"
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Create output directory if needed
mkdir -p "$(dirname "$OUTPUT_CSV")"
|
||||
|
||||
# Sample system-wide CPU utilisation and average core clock into a CSV log.
#
# Globals:   PID_FILE (read) - sampling continues for as long as this file exists
# Arguments: $1 - path of the CSV log to (re)create
# Outputs:   a header line followed by one "timestamp,cpu_usage,avg_freq" row
#            per sample (roughly every 0.5 s), all written to $1
monitor_cpu() {
  local log_file="$1"
  local cpu_usage avg_freq timestamp

  echo "Timestamp,CPU_Usage(%),Avg_Freq(MHz)" > "$log_file"

  while [ -f "$PID_FILE" ]; do
    # Locate the idle figure by its "id" label rather than by a fixed column:
    # top pads values to a fixed width, so a reading of 100.0 fuses with the
    # previous label ("ni,100.0 id") and shifts every later column by one.
    # LC_ALL=C keeps the decimal separator a dot.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
      /Cpu\(s\)/ {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^id,?$/) {
            idle = $(i - 1)
            sub(/^.*[,:]/, "", idle)   # drop a fused "ni," / "%Cpu(s):" prefix
            print 100 - idle
            exit
          }
        }
      }') || true

    # Not every /proc/cpuinfo carries "cpu MHz" lines; report 0 instead of
    # dividing by zero when none are found. Errors are silenced on purpose:
    # the frequency column is informational only.
    avg_freq=$(awk '
      /cpu MHz/ { sum += $4; count++ }
      END {
        if (count > 0) printf "%.0f", sum / count
        else printf "0"
      }' /proc/cpuinfo 2>/dev/null) || true
    avg_freq="${avg_freq:-0}"

    timestamp=$(date +%s.%N)
    echo "$timestamp,$cpu_usage,$avg_freq" >> "$log_file"
    sleep 0.5
  done
}
|
||||
|
||||
# Estimate average power draw (watts) from a CPU-usage CSV log.
#
# Power is not measured: it is a linear estimate, i.e. the mean of the
# CPU-usage column (percent) scaled by a full-load wattage.
#
# Globals:   CPU_TDP_WATTS (read, optional) - full-load wattage of the host
# Arguments: $1 - CSV log whose first line is a header and whose second
#                 column is CPU usage in percent
#            $2 - optional full-load wattage; defaults to $CPU_TDP_WATTS,
#                 or 200 when neither is given
# Outputs:   the estimate with two decimals on stdout, or "0" when the log
#            holds no usable samples
calculate_power() {
  local log_file="$1"
  local max_watts="${2:-${CPU_TDP_WATTS:-200}}"

  # Rows with an empty usage field are skipped so that a failed sample does
  # not drag the average towards zero.
  awk -F',' -v max_watts="$max_watts" '
    NR > 1 && $2 != "" { sum_cpu += $2; count++ }
    END {
      if (count > 0) {
        avg_cpu = sum_cpu / count
        printf "%.2f", avg_cpu * max_watts / 100
      } else {
        print "0"
      }
    }' "$log_file"
}
|
||||
|
||||
# Pull the mean throughput for one workload out of saved llama-bench output.
#
# Arguments: $1 - file holding the captured benchmark output
#            $2 - pattern selecting the result line(s), e.g. "pp128"
# Outputs:   the number standing directly before the first "±" on a matching
#            line, formatted with two decimals; nothing when no line matches
extract_throughput() {
  local report_file="$1"
  local test_label="$2"

  grep "$test_label" "$report_file" | awk '
    {
      # Results are printed as "mean ± std": the mean is the token
      # immediately to the left of the "±" sign.
      for (pos = 2; pos <= NF; pos++) {
        if ($pos == "±") {
          printf "%.2f", $(pos - 1)
          exit
        }
      }
    }'
}
|
||||
|
||||
# Run one llama-bench configuration while sampling CPU load, then emit a
# single CSV row describing it.
#
# Globals:   MODEL_PATH, TEMP_LOG, PID_FILE, BENCH_OUTPUT (read)
#            LLAMA_BENCH_BIN (read, optional) - benchmark executable,
#            defaults to ./build/bin/llama-bench
# Arguments: $1 - workload kind: "pp" selects pp128, anything else tg128
#            $2 - thread count handed to the benchmark via -t
# Outputs:   stdout - "workload,threads,throughput,power,joules_per_token"
#            stderr - progress messages and warnings
# Returns:   0; a failed benchmark is reported as a warning and yields a row
#            with zero throughput instead of aborting the sweep
run_benchmark() {
  local workload="$1" # "pp" or "tg"
  local threads="$2"
  local bench_bin="${LLAMA_BENCH_BIN:-./build/bin/llama-bench}"
  local workload_name
  local -a size_args
  local monitor_pid
  local bench_status=0
  local throughput power j_per_token

  # Request only the phase being measured. The tg run passes -p 0 (mirroring
  # the -n 0 of the pp run) so that prompt processing does not run as well
  # and leak into the power samples attributed to tg128.
  if [ "$workload" = "pp" ]; then
    size_args=(-p 128 -n 0)
    workload_name="pp128"
  else
    size_args=(-p 0 -n 128)
    workload_name="tg128"
  fi

  # Output progress to stderr (won't be captured in CSV)
  echo "Testing $workload_name with $threads threads..." >&2

  # Start monitoring: the sampler keeps running while PID_FILE exists.
  touch "$PID_FILE"
  monitor_cpu "$TEMP_LOG" &
  monitor_pid=$!

  # Run benchmark; remember a failure instead of exiting so the sampler is
  # always stopped below.
  "$bench_bin" -m "$MODEL_PATH" "${size_args[@]}" -t "$threads" -ngl 0 \
    > "$BENCH_OUTPUT" 2>&1 || bench_status=$?

  # Stop monitoring. The sampler's own exit status is irrelevant here.
  rm -f "$PID_FILE"
  wait "$monitor_pid" 2>/dev/null || true

  if [ "$bench_status" -ne 0 ]; then
    echo "Warning: llama-bench exited with status $bench_status for $workload_name, threads=$threads" >&2
  fi

  # Extract results
  throughput=$(extract_throughput "$BENCH_OUTPUT" "$workload_name") || throughput=""
  power=$(calculate_power "$TEMP_LOG") || power=""
  power="${power:-0}"

  # Compare numerically: a missing value and "0.00" both mean "no result".
  if ! awk -v t="$throughput" 'BEGIN { exit !(t + 0 > 0) }'; then
    echo "Warning: Failed to extract throughput for $workload_name, threads=$threads" >&2
    throughput="0"
  fi

  # Calculate J/t (Joules per token): watts divided by tokens per second.
  j_per_token=$(awk -v p="$power" -v t="$throughput" 'BEGIN {
    if (t > 0) printf "%.4f", p/t; else print "0"
  }')

  # Output progress to stderr
  echo "  Throughput: $throughput t/s, Power: $power W, Energy: $j_per_token J/t" >&2

  # Only output CSV line to stdout (this will be captured)
  echo "$workload_name,$threads,$throughput,$power,$j_per_token"
}
|
||||
|
||||
# Start the results file with its header row.
printf '%s\n' "Workload,Threads,Throughput(t/s),Power(W),Energy(J/t)" > "$OUTPUT_CSV"

# Sweep the PP thread list first, then the TG list. Each entry is
# "<workload>:<comma-separated thread counts>".
for sweep in "pp:$PP_THREADS" "tg:$TG_THREADS"; do
  sweep_kind="${sweep%%:*}"
  IFS=',' read -ra sweep_counts <<< "${sweep#*:}"
  for raw_count in "${sweep_counts[@]}"; do
    thread_count=$(echo "$raw_count" | xargs) # trim whitespace
    csv_row=$(run_benchmark "$sweep_kind" "$thread_count")
    echo "$csv_row" >> "$OUTPUT_CSV"
  done
done

# Remove the scratch files used during the sweep.
rm -f "$TEMP_LOG" "$BENCH_OUTPUT" "$PID_FILE"

# Final summary followed by the collected results.
printf '\n=== Benchmark Complete ===\nResults saved to: %s\n\n' "$OUTPUT_CSV"
cat "$OUTPUT_CSV"
|
||||