diff --git a/README.md b/README.md
index bfb09a6..2cc2a73 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.5
The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.
-**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.5x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md).
+**Latest optimization** introduces parallel kernel implementations with configurable tiling and embedding quantization support, achieving **1.15x to 2.1x** additional speedup over the original implementation across different hardware platforms and workloads. For detailed technical information, see the [optimization guide](src/README.md).
diff --git a/assets/performance.png b/assets/performance.png
index 03d477d..078fd3f 100644
Binary files a/assets/performance.png and b/assets/performance.png differ
diff --git a/src/README.md b/src/README.md
index 1a8ef2b..f713b9a 100644
--- a/src/README.md
+++ b/src/README.md
@@ -153,40 +153,40 @@ Comparison of optimized parallel kernels vs. original implementation:
**Test Configuration:**
- Model: BitNet-b1.58-2B-4T
-- Hardware: AMD EPYC 7V13 64-Core Processor
+- Hardware: AMD EPYC 7V13
- Threads: 1 / 2 / 4 / 8 / 12 / 16
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
- Method: Activation Parallel
-

+
**Test Configuration:**
- Model: BitNet-b1.58-2B-4T
-- Hardware: ARM Core
-- Threads: 1 / 2 / 4 / 8
+- Hardware: Intel i7-13800H
+- Threads: 1 / 2 / 4 / 6
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
-- Method: Activation Parallel with DOTPROD
+- Method: Activation Parallel
-

+
**Test Configuration:**
- Model: BitNet-b1.58-2B-4T
-- Hardware: ARM Core
+- Hardware: Cobalt 100
- Threads: 1 / 2 / 4 / 8
- Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128)
-- Method: Activation Parallel without DOTPROD
+- Method: Activation Parallel
-

+
diff --git a/src/assets/performance_arm_dotprod.png b/src/assets/performance_arm_dotprod.png
deleted file mode 100644
index 5163b3c..0000000
Binary files a/src/assets/performance_arm_dotprod.png and /dev/null differ
diff --git a/src/assets/performance_arm_no_dotprod.png b/src/assets/performance_arm_no_dotprod.png
deleted file mode 100644
index da980be..0000000
Binary files a/src/assets/performance_arm_no_dotprod.png and /dev/null differ
diff --git a/src/assets/performance_comparison_amd_epyc.png b/src/assets/performance_comparison_amd_epyc.png
new file mode 100644
index 0000000..6ebdb3d
Binary files /dev/null and b/src/assets/performance_comparison_amd_epyc.png differ
diff --git a/src/assets/performance_comparison_cobalt100_dotprod.png b/src/assets/performance_comparison_cobalt100_dotprod.png
new file mode 100644
index 0000000..4d0ef8c
Binary files /dev/null and b/src/assets/performance_comparison_cobalt100_dotprod.png differ
diff --git a/src/assets/performance_comparison_i7-13800h.png b/src/assets/performance_comparison_i7-13800h.png
new file mode 100644
index 0000000..e486d66
Binary files /dev/null and b/src/assets/performance_comparison_i7-13800h.png differ
diff --git a/src/assets/performance_x86.png b/src/assets/performance_x86.png
deleted file mode 100644
index 31a2332..0000000
Binary files a/src/assets/performance_x86.png and /dev/null differ
diff --git a/utils/test_power.sh b/utils/test_power.sh
new file mode 100755
index 0000000..79a1a68
--- /dev/null
+++ b/utils/test_power.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# Monitor power consumption for llama-bench with different thread configurations
+# Usage: ./test_power.sh <model_path> <output_csv> <pp_threads> <tg_threads>
+# Example: ./test_power.sh models/model.gguf results.csv "1,2,4,8" "1,2,4,8"
+
+set -e
+
+# Parse arguments: exactly four positional parameters are required
+if [ $# -ne 4 ]; then
+    echo "Usage: $0 <model_path> <output_csv> <pp_threads> <tg_threads>" >&2
+    echo "Example: $0 models/model.gguf results.csv \"1,2,4,8\" \"1,2,4,8\"" >&2
+    exit 1
+fi
+
+MODEL_PATH="$1"
+OUTPUT_CSV="$2"
+PP_THREADS="$3"
+TG_THREADS="$4"
+# Scratch files; the EXIT trap removes them on every exit path, and deleting PID_FILE ends monitor_cpu's loop
+TEMP_LOG="/tmp/power_monitor_$$.log"
+PID_FILE="/tmp/monitor_$$.pid"
+BENCH_OUTPUT="/tmp/bench_output_$$.txt"
+trap 'rm -f "$TEMP_LOG" "$BENCH_OUTPUT" "$PID_FILE"' EXIT
+
+# Validate model exists
+if [ ! -f "$MODEL_PATH" ]; then
+    echo "Error: Model file not found: $MODEL_PATH" >&2
+    exit 1
+fi
+# Create output directory if needed
+mkdir -p "$(dirname "$OUTPUT_CSV")"
+
+# Append "timestamp,cpu usage,average MHz" rows to the CSV file $1 every 0.5 s for as long as $PID_FILE exists
+monitor_cpu() {
+    local log_file="$1" cpu_usage avg_freq timestamp
+    echo "Timestamp,CPU_Usage(%),Avg_Freq(MHz)" > "$log_file"
+    while [ -f "$PID_FILE" ]; do
+        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print 100-$8}')  # assumes field 8 of the Cpu(s) line is the idle percentage
+        avg_freq=$(awk '/cpu MHz/ {sum+=$4; count++} END {printf "%.0f", (count ? sum/count : 0)}' /proc/cpuinfo)  # 0 when no "cpu MHz" lines exist
+        timestamp=$(date +%s.%N)
+        echo "$timestamp,$cpu_usage,$avg_freq" >> "$log_file"
+        sleep 0.5
+    done
+}
+
+# Function to calculate average power
+calculate_power() {
+ local log_file="$1"
+ awk -F',' 'NR>1 {sum_cpu+=$2; count++} END {
+ if (count > 0) {
+ avg_cpu = sum_cpu/count
+ est_power = avg_cpu * 200 / 100
+ printf "%.2f", est_power
+ } else {
+ print "0"
+ }
+ }' "$log_file"
+}
+
+# Function to extract throughput from llama-bench output
+extract_throughput() {
+ local bench_output="$1"
+ local workload="$2"
+ grep "$workload" "$bench_output" | awk '{
+ # Extract mean from "mean ± std" format
+ for (i=1; i<=NF; i++) {
+ if ($(i+1) == "±") {
+ printf "%.2f", $i
+ exit
+ }
+ }
+ }'
+}
+
+# Function to run single benchmark
+run_benchmark() {
+ local workload="$1" # "pp" or "tg"
+ local threads="$2"
+ local n_flag=""
+
+ if [ "$workload" = "pp" ]; then
+ n_flag="-n 0"
+ workload_name="pp128"
+ else
+ n_flag="-n 128"
+ workload_name="tg128"
+ fi
+
+ # Output progress to stderr (won't be captured in CSV)
+ echo "Testing $workload_name with $threads threads..." >&2
+
+ # Start monitoring
+ touch "$PID_FILE"
+ monitor_cpu "$TEMP_LOG" &
+ local monitor_pid=$!
+
+ # Run benchmark
+ ./build/bin/llama-bench -m "$MODEL_PATH" -p 128 $n_flag -t "$threads" -ngl 0 > "$BENCH_OUTPUT" 2>&1
+
+ # Stop monitoring
+ rm -f "$PID_FILE"
+ wait $monitor_pid 2>/dev/null || true
+
+ # Extract results
+ local throughput=$(extract_throughput "$BENCH_OUTPUT" "$workload_name")
+ local power=$(calculate_power "$TEMP_LOG")
+
+ if [ -z "$throughput" ] || [ "$throughput" = "0" ]; then
+ echo "Warning: Failed to extract throughput for $workload_name, threads=$threads" >&2
+ throughput="0"
+ fi
+
+ # Calculate J/t (Joules per token)
+ local j_per_token=$(awk -v p="$power" -v t="$throughput" 'BEGIN {
+ if (t > 0) printf "%.4f", p/t; else print "0"
+ }')
+
+ # Output progress to stderr
+ echo " Throughput: $throughput t/s, Power: $power W, Energy: $j_per_token J/t" >&2
+
+ # Only output CSV line to stdout (this will be captured)
+ echo "$workload_name,$threads,$throughput,$power,$j_per_token"
+}
+
+# Start a fresh results file containing only the CSV header
+printf '%s\n' "Workload,Threads,Throughput(t/s),Power(W),Energy(J/t)" > "$OUTPUT_CSV"
+
+# Prompt-processing sweep: one CSV row per entry of the comma-separated PP_THREADS list
+IFS=',' read -ra PP_ARRAY <<< "$PP_THREADS"
+for threads in "${PP_ARRAY[@]}"; do
+    threads=$(xargs <<< "$threads") # xargs with no command echoes its input, trimming whitespace
+    result=$(run_benchmark "pp" "$threads")
+    printf '%s\n' "$result" >> "$OUTPUT_CSV"
+done
+
+# Token-generation sweep over TG_THREADS, handled the same way
+IFS=',' read -ra TG_ARRAY <<< "$TG_THREADS"
+for threads in "${TG_ARRAY[@]}"; do
+    threads=$(xargs <<< "$threads")
+    result=$(run_benchmark "tg" "$threads")
+    printf '%s\n' "$result" >> "$OUTPUT_CSV"
+done
+
+# Remove the scratch files
+rm -f "$TEMP_LOG" "$BENCH_OUTPUT" "$PID_FILE"
+
+# Final report: banner, output location, then the CSV itself
+echo
+echo "=== Benchmark Complete ==="
+printf '%s\n\n' "Results saved to: $OUTPUT_CSV"
+cat "$OUTPUT_CSV"