diff --git a/README.md b/README.md
index 6f949fd..4318061 100644
--- a/README.md
+++ b/README.md
@@ -214,7 +214,7 @@ optional arguments:
Directory to save the logging info
--quant-type {i2_s,tl1}, -q {i2_s,tl1}
Quantization type
- --quant-embd Quantize the embeddings to f16
+ --quant-embd Quantize the embeddings to Q6_K
--use-pretuned, -p Use the pretuned kernel parameters
## Usage
diff --git a/include/gemm-config.h b/include/gemm-config.h
index d766dfb..6a88c42 100644
--- a/include/gemm-config.h
+++ b/include/gemm-config.h
@@ -5,19 +5,31 @@
#define ROW_BLOCK_SIZE 4
#define COL_BLOCK_SIZE 128
#define PARALLEL_SIZE 4
#else
- #define ROW_BLOCK_SIZE 32
- #define COL_BLOCK_SIZE 4
- #define PARALLEL_SIZE 4
-#endif
+ #define ROW_BLOCK_SIZE 128
+ #define COL_BLOCK_SIZE 32
+ #define PARALLEL_SIZE 8
+#endif // ACT_PARALLEL
#elif defined(__ARM_NEON)
+#if defined(__ARM_FEATURE_DOTPROD)
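+// Dot-product (SDOT/UDOT) instructions are available on this target.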
#if defined(ACT_PARALLEL)
#define ROW_BLOCK_SIZE 8
- #define COL_BLOCK_SIZE 64
+ #define COL_BLOCK_SIZE 256
#define PARALLEL_SIZE 8
#else
- #define ROW_BLOCK_SIZE 16
- #define COL_BLOCK_SIZE 4
+ #define ROW_BLOCK_SIZE 64
+ #define COL_BLOCK_SIZE 16
+ #define PARALLEL_SIZE 2
+#endif // ACT_PARALLEL
+#else
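+// Fallback block sizes for NEON targets without the dot-product extension.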
+#if defined(ACT_PARALLEL)
+ #define ROW_BLOCK_SIZE 8
+ #define COL_BLOCK_SIZE 256
#define PARALLEL_SIZE 4
-#endif
-#endif
+#else
+ #define ROW_BLOCK_SIZE 128
+ #define COL_BLOCK_SIZE 32
+ #define PARALLEL_SIZE 4
+#endif // ACT_PARALLEL
+#endif // __ARM_FEATURE_DOTPROD
+#endif // __AVX__
diff --git a/setup_env.py b/setup_env.py
index f15d65f..7d84ed7 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -136,12 +136,12 @@ def prepare_model():
# quantize to i2s
if platform.system() != "Windows":
if quant_embd:
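+ # --token-embedding-type keeps the token-embedding tensor in Q6_K while the weights are quantized to I2_S.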
- run_command(["./build/bin/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s")
+ run_command(["./build/bin/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s")
else:
run_command(["./build/bin/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s")
else:
if quant_embd:
- run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "f16", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s")
+ run_command(["./build/bin/Release/llama-quantize", "--token-embedding-type", "q6_k", f32_model, i2s_model, "I2_S", "1", "1"], log_step="quantize_to_i2s")
else:
run_command(["./build/bin/Release/llama-quantize", f32_model, i2s_model, "I2_S", "1"], log_step="quantize_to_i2s")
@@ -228,7 +228,7 @@ def parse_args():
parser.add_argument("--model-dir", "-md", type=str, help="Directory to save/load the model", default="models")
parser.add_argument("--log-dir", "-ld", type=str, help="Directory to save the logging info", default="logs")
parser.add_argument("--quant-type", "-q", type=str, help="Quantization type", choices=SUPPORTED_QUANT_TYPES[arch], default="i2_s")
- parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to f16")
+ parser.add_argument("--quant-embd", action="store_true", help="Quantize the embeddings to Q6_K")
parser.add_argument("--use-pretuned", "-p", action="store_true", help="Use the pretuned kernel parameters")
return parser.parse_args()
diff --git a/src/README.md b/src/README.md
index b7eaef4..1a8ef2b 100644
--- a/src/README.md
+++ b/src/README.md
@@ -20,28 +20,27 @@ This update provides significant performance improvements for BitNet inference o
### Configuration Options
-The `gemm-config.h` file controls kernel behavior:
+The `include/gemm-config.h` file controls kernel behavior:
```c
-#define ACT_PARALLEL // Enable activation parallelism, otherwise weight parallelism
-
-#if defined(ACT_PARALLEL)
- #define ROW_BLOCK_SIZE 4 // Number of rows processed per block
- #define COL_BLOCK_SIZE 32 // Number of columns processed per block
- #define PARALLEL_SIZE 4 // Degree of parallelism
-#else
- #define ROW_BLOCK_SIZE 32
- #define COL_BLOCK_SIZE 4
- #define PARALLEL_SIZE 4
-#endif
+#define ROW_BLOCK_SIZE 4    // Number of rows processed per block
+#define COL_BLOCK_SIZE 128  // Number of columns processed per block
+#define PARALLEL_SIZE 4     // Degree of parallelism
```
-Modify these values based on your CPU cache size and architecture for optimal performance. Users can fine-tune performance on their machine through `gemm-config.h`.
+Modify these values in `include/gemm-config.h` to match your CPU's cache sizes and architecture; fine-tuning them per machine can yield further performance gains.
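+
+For reference, the tuned defaults in this update differ per architecture; the activation-parallel branch for NEON with the dot-product extension, for example, uses:
+
+```c
+#define ROW_BLOCK_SIZE 8      // rows per block
+#define COL_BLOCK_SIZE 256    // columns per block
+#define PARALLEL_SIZE 8       // degree of parallelism
+```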
### Enabling Embedding Quantization
To use embedding quantization for additional speedup:
+**Using setup_env.py:**
+```bash
+python setup_env.py --quant-embd
+```
+This automatically quantizes the token embeddings to Q6_K.
+
+**Manual conversion:**
```bash
build/bin/llama-quantize --token-embedding-type Q6_K models/BitNet-b1.58-2B-4T/ggml-model-f32.gguf models/BitNet-b1.58-2B-4T/ggml-model-i2_s-embed-q6_k.gguf I2_S 1 1
```
@@ -52,51 +51,104 @@ build/bin/llama-quantize --token-embedding-type Q6_K models/BitNet-b1.58-2B-4T/g
The kernel implements two parallelization strategies:
-- **Weight Parallel:** Reduces kernel launch overhead by processing multiple weight rows/columns in a single kernel call
-- **Activation Parallel:** Built on top of weight parallel, further reduces the unpack overhead when reading I2_S format weights by amortizing the unpacking cost across multiple activation elements
-- **Recommendation:** For I2_S quantization format, activation parallel is recommended and used in all subsequent benchmarks
+- **Weight Parallel:** Processes multiple weight rows/columns in a single kernel call, reducing kernel launch overhead.
-**Key Optimizations:**
-- **Vectorized Operations:** Utilizes SIMD instructions (AVX2 for x86, NEON for ARM) to process multiple elements simultaneously
-- **Parallel Accumulation:** Processes multiple weight-activation pairs in parallel, reducing sequential dependencies
-- **Reduced Memory Latency:** Optimized memory access patterns minimize cache misses
+- **Activation Parallel:** Builds on weight parallel and amortizes the I2_S weight-unpacking cost across multiple activation elements.
-**Schematic diagram:**
+**Recommendation:** For the I2_S quantization format, activation parallel is recommended because it reduces this unpacking overhead; the current kernel defaults to activation parallel (see the sketch below).
-
-
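+
+A minimal C sketch of why activation parallelism helps (illustrative only: `unpack_i2_s_block` and `vec_dot_act_parallel` are hypothetical stand-ins, the bit layout is simplified, and the real kernel uses SIMD routines such as `ggml_vec_dot_i2_i8_s_1xN()`):
+
+```c
+#include <stdint.h>
+
+#define PARALLEL_SIZE 4   /* activation vectors sharing one unpack */
+#define BLOCK_LEN     1024
+
+/* Hypothetical helper: unpack n 2-bit I2_S weights to int8.
+ * The bit layout shown is illustrative, not the real I2_S format. */
+static void unpack_i2_s_block(const uint8_t *packed, int8_t *out, int n) {
+    for (int i = 0; i < n; i++) {
+        out[i] = (int8_t)(((packed[i / 4] >> ((i % 4) * 2)) & 0x3) - 1);
+    }
+}
+
+/* Weight parallel pays the unpack cost once per activation vector;
+ * activation parallel pays it once per weight block and reuses the
+ * unpacked weights for PARALLEL_SIZE activation vectors. */
+static void vec_dot_act_parallel(const uint8_t *w_packed,
+                                 const int8_t *const acts[PARALLEL_SIZE],
+                                 int32_t sums[PARALLEL_SIZE], int n) {
+    int8_t w[BLOCK_LEN];                      /* assumes n <= BLOCK_LEN */
+    unpack_i2_s_block(w_packed, w, n);        /* unpack once ...        */
+    for (int a = 0; a < PARALLEL_SIZE; a++) { /* ... reuse PARALLEL_SIZE times */
+        int32_t s = 0;
+        for (int i = 0; i < n; i++) {
+            s += (int32_t)w[i] * acts[a][i];
+        }
+        sums[a] = s;
+    }
+}
+```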
+**Kernel Performance Comparison:**
-**Code Structure:**
-- `ggml_vec_dot_i2_i8_s_1xN()`: Processes N rows in parallel
-- `ggml_vec_dot_i2_i8_s_Nx1()`: Processes N columns in parallel
-- Automatic dispatch based on `ACT_PARALLEL` configuration
+
+
+*Throughput (tokens/s) for the different kernel configurations.*
+
+
+
+*Token generation throughput (tg128) for different embedding quantization types.*
+
+
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | 17.65 ± 0.16 | |
-| 2 | | 32.24 ± 0.64 | |
-| 4 | | 54.34 ± 0.12 | |
-| 8 | | 74.42 ± 0.25 | |
-| 12 | | 76.37 ± 0.18 | |
-| 16 | | 74.02 ± 0.15 | |
+
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
+
-**Token Generation (tg128)**
-
-| Threads | Original | Activation Parallel | Speedup |
-|---------|----------|---------------------|---------|
-| 1 | | | |
-| 2 | | | |
-| 4 | | | |
-| 8 | | | |
-
-**Speedup Visualization:**
-
-```
-[TODO]
-```
-
-### Embedding Quantize Evaluation
-
-**Performance test:**
-
-
-
-**Quality test:**
-
-
-
+