diff --git a/README.md b/README.md index 6f949fd..4318061 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ optional arguments: Directory to save the logging info --quant-type {i2_s,tl1}, -q {i2_s,tl1} Quantization type - --quant-embd Quantize the embeddings to f16 + --quant-embd Quantize the embeddings to q6_k --use-pretuned, -p Use the pretuned kernel parameters ## Usage diff --git a/include/gemm-config.h b/include/gemm-config.h index d766dfb..6a88c42 100644 --- a/include/gemm-config.h +++ b/include/gemm-config.h @@ -5,19 +5,31 @@ #define COL_BLOCK_SIZE 128 #define PARALLEL_SIZE 4 #else - #define ROW_BLOCK_SIZE 32 - #define COL_BLOCK_SIZE 4 - #define PARALLEL_SIZE 4 -#endif + #define ROW_BLOCK_SIZE 128 + #define COL_BLOCK_SIZE 32 + #define PARALLEL_SIZE 8 +#endif // ACT_PARALLEL #elif defined(__ARM_NEON) +#if defined(__ARM_FEATURE_DOTPROD) #if defined(ACT_PARALLEL) #define ROW_BLOCK_SIZE 8 - #define COL_BLOCK_SIZE 64 + #define COL_BLOCK_SIZE 256 #define PARALLEL_SIZE 8 #else - #define ROW_BLOCK_SIZE 16 - #define COL_BLOCK_SIZE 4 + #define ROW_BLOCK_SIZE 64 + #define COL_BLOCK_SIZE 16 + #define PARALLEL_SIZE 2 +#endif // ACT_PARALLEL +#else +#if defined(ACT_PARALLEL) + #define ROW_BLOCK_SIZE 8 + #define COL_BLOCK_SIZE 256 #define PARALLEL_SIZE 4 -#endif -#endif +#else + #define ROW_BLOCK_SIZE 128 + #define COL_BLOCK_SIZE 32 + #define PARALLEL_SIZE 4 +#endif // ACT_PARALLEL +#endif // __ARM_FEATURE_DOTPROD +#endif // __AVX__ diff --git a/setup_env.py b/setup_env.py index f15d65f..7d84ed7 100644 --- a/setup_env.py +++ b/setup_env.py @@ -136,12 +136,12 @@ def prepare_model(): # quantize to i2s if platform.system() != "Windows": if quant_embd: - run_command(["./build/bin/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") + run_command(["./build/bin/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") else: run_command(["./build/bin/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s") else: if quant_embd: - run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") + run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s") else: run_command(["./build/bin/Release/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s") @@ -228,7 +228,7 @@ def parse_args(): parser.add_argument("--model-dir", "-md", type=str, help="Directory to save/load the model", default="models") parser.add_argument("--log-dir", "-ld", type=str, help="Directory to save the logging info", default="logs") parser.add_argument("--quant-type", "-q", type=str, help="Quantization type", choices=SUPPORTED_QUANT_TYPES[arch], default="i2_s") - parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to f16") + parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to q6_k") parser.add_argument("--use-pretuned", "-p", action="store_true", help="Use the pretuned kernel parameters") return parser.parse_args() diff --git a/src/README.md b/src/README.md index b7eaef4..1a8ef2b 100644 --- a/src/README.md +++ b/src/README.md @@ -20,28 +20,27 @@ This update provides significant performance improvements for BitNet inference o ### Configuration Options -The `gemm-config.h` file controls kernel behavior: 
+The `include/gemm-config.h` file controls kernel behavior:
 
 ```c
-#define ACT_PARALLEL // Enable activation parallelism, otherwise weight parallelism
-
-#if defined(ACT_PARALLEL)
-    #define ROW_BLOCK_SIZE 4 // Number of rows processed per block
-    #define COL_BLOCK_SIZE 32 // Number of columns processed per block
-    #define PARALLEL_SIZE 4 // Degree of parallelism
-#else
-    #define ROW_BLOCK_SIZE 32
-    #define COL_BLOCK_SIZE 4
-    #define PARALLEL_SIZE 4
-#endif
+#define ROW_BLOCK_SIZE 4   // Number of rows processed per block
+#define COL_BLOCK_SIZE 128 // Number of columns processed per block
+#define PARALLEL_SIZE 4    // Degree of parallelism
 ```
 
-Modify these values based on your CPU cache size and architecture for optimal performance. Users can fine-tune performance on their machine through `gemm-config.h`.
+Modify these values based on your CPU's cache sizes and architecture; `include/gemm-config.h` is the single place to fine-tune kernel performance for a given machine.
 
 ### Enabling Embedding Quantization
 
 To use embedding quantization for additional speedup:
 
+**Using setup_env.py:**
+```bash
+python setup_env.py --quant-embd
+```
+This automatically converts the embeddings to Q6_K format.
+
+**Manual conversion:**
 ```bash
 build/bin/llama-quantize --token-embedding-type Q6_K models/BitNet-b1.58-2B-4T/ggml-model-f32.gguf models/BitNet-b1.58-2B-4T/ggml-model-i2_s-embed-q6_k.gguf I2_S 1 1
 ```
 
@@ -52,51 +51,104 @@ build/bin/llama-quantize --token-embedding-type Q6_K models/BitNet-b1.58-2B-4T/g
 
 The kernel implements two parallelization strategies:
 
-- **Weight Parallel:** Reduces kernel launch overhead by processing multiple weight rows/columns in a single kernel call
-- **Activation Parallel:** Built on top of weight parallel, further reduces the unpack overhead when reading I2_S format weights by amortizing the unpacking cost across multiple activation elements
-- **Recommendation:** For I2_S quantization format, activation parallel is recommended and used in all subsequent benchmarks
+- **Weight Parallel:** Processes multiple weight rows/columns in a single kernel call, reducing kernel launch overhead.
 
-**Key Optimizations:**
-- **Vectorized Operations:** Utilizes SIMD instructions (AVX2 for x86, NEON for ARM) to process multiple elements simultaneously
-- **Parallel Accumulation:** Processes multiple weight-activation pairs in parallel, reducing sequential dependencies
-- **Reduced Memory Latency:** Optimized memory access patterns minimize cache misses
+- **Activation Parallel:** Built on top of weight parallel, amortizes the I2_S weight unpacking cost across multiple activation elements.
 
-**Schematic diagram:**
+**Recommendation:** For the I2_S quantization format, activation parallel is recommended because it amortizes the unpack overhead; the current kernel defaults to it. A scalar sketch of both strategies follows the comparison table below.
 
-![weight_parallel](assets/weight_parallel.png)
-![activation_parallel](assets/activation_parallel.png)
+**Kernel Performance Comparison:**
 
-**Code Structure:**
-- `ggml_vec_dot_i2_i8_s_1xN()`: Processes N rows in parallel
-- `ggml_vec_dot_i2_i8_s_Nx1()`: Processes N columns in parallel
-- Automatic dispatch based on `ACT_PARALLEL` configuration
+
+Test configuration: AMD EPYC 7V13 (x86), 1 thread, time in milliseconds (mean±std, lower is better; best result per row in bold)
+
+| Matrix Size | No Parallel | Weight Parallel | Activation Parallel |
+|:---:|:---:|:---:|:---:|
+| [1, 2048] × [2048, 2048] | 0.075±0.012 | **0.058±0.007** | 0.076±0.011 |
+| [32, 2048] × [2048, 2048] | 2.400±0.041 | 1.599±0.020 | **1.202±0.018** |
+| [128, 2048] × [2048, 2048] | 10.820±0.039 | 6.458±0.168 | **5.805±0.039** |
+| [256, 2048] × [2048, 2048] | 21.669±0.080 | 12.739±0.183 | **11.882±0.040** |
+| [512, 2048] × [2048, 2048] | 43.257±0.083 | 25.680±0.335 | **23.342±0.082** |
+| [2048, 2048] × [2048, 2048] | 173.175±0.214 | 103.112±0.552 | **93.276±0.612** |
+| [128, 2048] × [2048, 8192] | 43.345±0.090 | 25.541±0.239 | **23.528±0.052** |
+| [128, 8192] × [8192, 2048] | 38.085±0.162 | 23.866±0.096 | **22.569±0.132** |
+
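+For intuition, a minimal scalar sketch of the two strategies is shown below. This is illustrative only: the real kernels in `src/ggml-bitnet-mad.cpp` are SIMD-vectorized, and the 2-bit packing layout used here is an assumption for demonstration.
+
+```c
+#include <stdint.h>
+#include <stddef.h>
+
+/* Assumed packing (for illustration): one byte holds four 2-bit weights;
+ * stored values 0..2 map to the ternary weights -1..1 (3 is unused). */
+static void unpack_i2s_byte(uint8_t packed, int8_t w[4]) {
+    for (int i = 0; i < 4; i++)
+        w[i] = (int8_t)((packed >> (2 * i)) & 3) - 1;
+}
+
+/* Weight parallel: several weight rows per call (less launch overhead),
+ * but each row is unpacked for a single activation vector. */
+static void dot_weight_parallel(const uint8_t *const wrows[], const int8_t *act,
+                                int32_t *acc, int n_rows, size_t k_bytes) {
+    for (int r = 0; r < n_rows; r++)
+        for (size_t i = 0; i < k_bytes; i++) {
+            int8_t w[4];
+            unpack_i2s_byte(wrows[r][i], w);   /* unpack per (row, act) pair */
+            for (int j = 0; j < 4; j++)
+                acc[r] += w[j] * act[4 * i + j];
+        }
+}
+
+/* Activation parallel: each unpacked weight byte is reused for several
+ * activation vectors, amortizing the unpack cost n_act-fold. */
+static void dot_act_parallel(const uint8_t *wrow, const int8_t *const act[],
+                             int32_t *acc, int n_act, size_t k_bytes) {
+    for (size_t i = 0; i < k_bytes; i++) {
+        int8_t w[4];
+        unpack_i2s_byte(wrow[i], w);           /* unpack once ...            */
+        for (int m = 0; m < n_act; m++)        /* ... reuse for n_act inputs */
+            for (int j = 0; j < 4; j++)
+                acc[m] += w[j] * act[m][4 * i + j];
+    }
+}
+```
+
+Note that at batch size 1 there is only one activation vector to share the unpack across, which matches the table above: weight parallel wins the [1, 2048] case, while activation parallel wins everywhere the batch is larger.
+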
 ### 2. GEMM/GEMV Integration with llama.cpp
 
 Integrated I2_S quantization format into llama.cpp's compute graph:
 
-- **GEMV Operations:** Optimized matrix-vector multiplication for token generation
-- **GEMM Operations:** Efficient matrix-matrix multiplication for batch processing
-- **Tiling Strategy:** Configurable block sizes for optimal cache utilization
+- **GEMV Operations:** Optimized matrix-vector multiplication for token generation.
+- **GEMM Operations:** Efficient matrix-matrix multiplication for prompt processing.
+- **Tiling Strategy:** Configurable block sizes for optimal cache utilization.
 
-**Benefits:**
-- No modifications required to model architecture code
-- Compatible with existing llama.cpp optimizations
-- Supports dynamic batching and sequence parallelism
+### 3. Configuration Fine-tuning
 
-### 3. Embedding Quantization (Q6_K)
+Kernel parameters can be fine-tuned for optimal performance on specific hardware:
 
-Quantizes embedding layers to 6-bit precision:
+**Example Configuration (x86, AMD EPYC 7V13):**
+- Method: Activation Parallel
+- Threads: 8
+- Workload: 128 prompt tokens (pp128)
 
-- **Quantization Method:** k-quants (Q6_K) with grouped quantization
-- **Accuracy Preservation:** Maintains >99% similarity to FP32 embeddings
-- **Memory Savings:** ~5.33x reduction in embedding size (FP32 → Q6_K)
-- **Dequantization:** Automatic dequantization during forward pass
+**Fine-tuning Parameters:**
+- **Parallelism Degree:** [2, 4, 8]
+- **Row Block Size:** [2, 4, 8, 16, 32]
+- **Column Block Size:** [32, 64, 128, 256, 512, 1024]
+
+(A schematic loop nest showing what these knobs control follows the results figure below.)
+
+**Fine-tuning Results:**
+
+
+![fine_tune_result](assets/fine_tuning_result.png)
+
+*Throughput (tokens/s) for each configuration in the sweep.*
+
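+For intuition about what the swept knobs control, here is a schematic blocked loop nest. It is a sketch under a plain-C tiling interpretation of the macros; the actual vectorized kernel organizes its memory layout and loops differently.
+
+```c
+#include <stddef.h>
+
+#define ROW_BLOCK_SIZE 4    /* weight rows per tile */
+#define COL_BLOCK_SIZE 128  /* weight columns per tile */
+#define PARALLEL_SIZE  4    /* activation vectors batched per pass */
+
+/* Schematic: accumulate y[p][r] += sum_c W[r][c] * x[p][c] tile by tile, so
+ * each ROW_BLOCK_SIZE x COL_BLOCK_SIZE slice of W is loaded once and reused
+ * for all PARALLEL_SIZE activation vectors while it is cache-resident. */
+static void gemm_tile_sketch(const float *W, const float *x, float *y,
+                             size_t rows, size_t cols) {
+    for (size_t r0 = 0; r0 < rows; r0 += ROW_BLOCK_SIZE)
+        for (size_t c0 = 0; c0 < cols; c0 += COL_BLOCK_SIZE)
+            for (size_t r = r0; r < r0 + ROW_BLOCK_SIZE && r < rows; r++)
+                for (size_t p = 0; p < PARALLEL_SIZE; p++)
+                    for (size_t c = c0; c < c0 + COL_BLOCK_SIZE && c < cols; c++)
+                        y[p * rows + r] += W[r * cols + c] * x[p * cols + c];
+}
+```
+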
+ +**Optimal Configuration:** Under this setup (x86, 8 threads, pp128), the best performance is achieved with parallelism degree = 4, row block size = 4, and column block size = 128. + +### 4. Embedding Quantization + +Evaluated multiple embedding quantization formats to balance memory usage, model quality, and inference speed: + +**Perplexity Comparison:** + +
+
+Test configuration: BitNet-b1.58-2B-4T, TG128; values are perplexity (lower is better)
+
+| Embedding Type | Wikitext | PTB | LAMBADA | IMDB | AG NEWS |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| **F32** | 17.1090±0.1278 | 33.0858±0.4886 | 43.2850±0.6363 | 29.3016±0.2890 | 36.7686±0.3920 |
+| **F16** | 17.1090±0.1278 | 33.0858±0.4886 | 43.2850±0.6363 | 29.3016±0.2890 | 36.7686±0.3920 |
+| **Q8_0** | 17.1197±0.1280 | 33.1181±0.4893 | 43.2891±0.6364 | 29.3133±0.2892 | 36.7740±0.3920 |
+| **Q6_K** | 17.1487±0.1282 | 33.2203±0.4914 | 43.3046±0.6362 | 29.3491±0.2897 | 36.7972±0.3921 |
+| **Q5_0** | 17.2379±0.1288 | 33.2439±0.4907 | 43.4631±0.6379 | 29.5481±0.2920 | 36.8539±0.3924 |
+| **Q4_0** | 17.3529±0.1300 | 33.7754±0.5001 | 44.4552±0.6559 | 30.1044±0.2978 | 37.3985±0.3997 |
+| **Q3_K** | 17.6434±0.1320 | 34.3914±0.5089 | 45.4591±0.6735 | 30.8476±0.3069 | 39.5692±0.4259 |
+| **I2_S** | N/A | N/A | N/A | N/A | N/A |
+
+*N/A indicates model failure due to extreme quantization.*
+
+ +**Inference Speed Comparison:** + +
+
+![embedding_throughput](assets/embedding_throughput.png)
+
+*Token generation throughput (tg128) for different embedding quantization types.*
+
+ +**Recommendation:** Based on comprehensive evaluation of memory footprint, perplexity preservation, and inference speed, **Q6_K** is selected as the optimal embedding quantization format. ## Performance -### End-to-End Inference Performance - Comparison of optimized parallel kernels vs. original implementation: **Test Configuration:** @@ -106,27 +158,11 @@ Comparison of optimized parallel kernels vs. original implementation: - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) - Method: Activation Parallel -**Prompt Processing (pp128)** +
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | 47.49 ± 0.16 | |
-| 2 | | 89.94 ± 0.25 | |
-| 4 | | 169.61 ± 2.64 | |
-| 8 | | 295.70 ± 3.19 | |
-| 12 | | 403.04 ± 0.49 | |
-| 16 | | 521.58 ± 0.95 | |
+![x86_performance](assets/performance_x86.png)
 
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | 17.65 ± 0.16 | |
-| 2 | | 32.24 ± 0.64 | |
-| 4 | | 54.34 ± 0.12 | |
-| 8 | | 74.42 ± 0.25 | |
-| 12 | | 76.37 ± 0.18 | |
-| 16 | | 74.02 ± 0.15 | |
+
**Test Configuration:** - Model: BitNet-b1.58-2B-4T @@ -135,23 +171,11 @@ Comparison of optimized parallel kernels vs. original implementation: - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) - Method: Activation Parallel with DOTPROD -**Prompt Processing (pp128)** +
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
+![arm_dotprod_performance](assets/performance_arm_dotprod.png)
 
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
+
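+The results above use the ARMv8.2 DOTPROD extension, which `include/gemm-config.h` gates on `__ARM_FEATURE_DOTPROD`: it fuses the int8 widen-multiply-accumulate sequence into a single `sdot` instruction, while the fallback path benchmarked below has to emulate it. A minimal sketch of the difference (illustrative only; the repository's kernel structures its loops differently):
+
+```c
+#include <arm_neon.h>
+
+/* Accumulate 16 int8 weight x activation products into four int32 lanes.
+ * The per-lane grouping differs between the two paths, but the horizontal
+ * sum, which is all a dot product needs, is identical. */
+static inline int32x4_t i8_dot_step(int32x4_t acc, int8x16_t w, int8x16_t a) {
+#if defined(__ARM_FEATURE_DOTPROD)
+    return vdotq_s32(acc, w, a);                  /* one sdot instruction */
+#else
+    int16x8_t lo = vmull_s8(vget_low_s8(w), vget_low_s8(a));
+    int16x8_t hi = vmull_high_s8(w, a);           /* widen to int16 */
+    return vpadalq_s16(vpadalq_s16(acc, lo), hi); /* pairwise add into int32 */
+#endif
+}
+```
+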
**Test Configuration:** - Model: BitNet-b1.58-2B-4T @@ -160,48 +184,14 @@ Comparison of optimized parallel kernels vs. original implementation: - Test: 128 prompt tokens (pp128) + 128 generated tokens (tg128) - Method: Activation Parallel without DOTPROD -**Prompt Processing (pp128)** +
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
+![arm_no_dotprod_performance](assets/performance_arm_no_dotprod.png)
 
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
-
-**Speedup Visualization:**
-
-```
-[TODO]
-```
-
-### Embedding Quantize Evaluation
-
-**Performance test:**
-
-![performance_test](assets/performance_test.png)
-
-**Quality test:**
-
-![quality_test](assets/quality_test.png)
+
## Technical Details -### Commit Information - -- **Latest Commit:** `43da5e5f760887d5b061c95605cd89a7e63db76b` -- **Baseline Commit:** `404980eecae38affa4871c3e419eae3f44536a95` - ### Key Files Modified - `src/ggml-bitnet-mad.cpp`: Parallel kernel implementations diff --git a/src/assets/activation_parallel.png b/src/assets/activation_parallel.png deleted file mode 100644 index 585f38a..0000000 Binary files a/src/assets/activation_parallel.png and /dev/null differ diff --git a/src/assets/embedding_throughput.png b/src/assets/embedding_throughput.png new file mode 100644 index 0000000..b3ebb82 Binary files /dev/null and b/src/assets/embedding_throughput.png differ diff --git a/src/assets/fine_tuning_result.png b/src/assets/fine_tuning_result.png new file mode 100644 index 0000000..edb19cd Binary files /dev/null and b/src/assets/fine_tuning_result.png differ diff --git a/src/assets/performance_arm_dotprod.png b/src/assets/performance_arm_dotprod.png new file mode 100644 index 0000000..5163b3c Binary files /dev/null and b/src/assets/performance_arm_dotprod.png differ diff --git a/src/assets/performance_arm_no_dotprod.png b/src/assets/performance_arm_no_dotprod.png new file mode 100644 index 0000000..da980be Binary files /dev/null and b/src/assets/performance_arm_no_dotprod.png differ diff --git a/src/assets/performance_test.png b/src/assets/performance_test.png deleted file mode 100644 index 587b22e..0000000 Binary files a/src/assets/performance_test.png and /dev/null differ diff --git a/src/assets/performance_x86.png b/src/assets/performance_x86.png new file mode 100644 index 0000000..31a2332 Binary files /dev/null and b/src/assets/performance_x86.png differ diff --git a/src/assets/quality_test.png b/src/assets/quality_test.png deleted file mode 100644 index 29f7969..0000000 Binary files a/src/assets/quality_test.png and /dev/null differ diff --git a/src/assets/weight_parallel.png b/src/assets/weight_parallel.png deleted file mode 100644 index b7567d5..0000000 Binary files a/src/assets/weight_parallel.png and /dev/null differ