/** * Standalone benchmark for ggml_gemm_i2_i8_s kernel * * This program tests the performance of the ggml_gemm_i2_i8_s kernel * with configurable matrix sizes and iteration counts. * * Usage: ./test_gemm_kernel [options] * -n : embedding dimension (must be divisible by 4, default: 2048) * -r : number of rows in matrix Y (default: 32) * -c : number of columns in matrix X (default: 128) * -i : number of iterations (default: 1000) * -w : number of warmup iterations (default: 10) */ #include #include #include #include #include #include #include // Include necessary headers #include "../include/gemm-config.h" // Function declarations (from ggml-quants.h) extern "C" void ggml_vec_dot_i2_i8_s(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc); // GEMM kernel definition void ggml_gemm_i2_i8_s(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { #if defined(ACT_PARALLEL) const int64_t row_block = ROW_BLOCK_SIZE; const int64_t col_block = COL_BLOCK_SIZE; for (int64_t c0 = 0; c0 < nc; c0 += col_block) { int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0); for (int64_t r0 = 0; r0 < nr; r0 += row_block) { int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0); const void * vy_r = (const uint8_t *)vy + r0 * n; for (int64_t c = 0; c < cur_c; ++c) { const int64_t col = c0 + c; float * s_col = s + col; const void * vx_col = (const uint8_t *)vx + col * n / 4; ggml_vec_dot_i2_i8_s(n, s_col + r0 * bs, bs, vx_col, n, vy_r, n, cur_r); } } } #else const int64_t row_block = ROW_BLOCK_SIZE; const int64_t col_block = COL_BLOCK_SIZE; for (int64_t r0 = 0; r0 < nr; r0 += row_block) { int64_t cur_r = (r0 + row_block <= nr) ? row_block : (nr - r0); for (int64_t c0 = 0; c0 < nc; c0 += col_block) { int64_t cur_c = (c0 + col_block <= nc) ? col_block : (nc - c0); const void * vx_c = (const uint8_t *)vx + c0 * n / 4; for (int64_t r = 0; r < cur_r; ++r) { const int64_t row = r0 + r; float * s_row = s + row * bs; const void * vy_row = (const uint8_t *)vy + row * n; ggml_vec_dot_i2_i8_s(n, s_row + c0, bs, vx_c, n, vy_row, n, cur_c); } } } #endif } // Helper function to get current time in nanoseconds double get_time_ns() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec * 1e9 + ts.tv_nsec; } // Initialize matrix with random i2 values (2-bit quantized) void init_matrix_i2(uint8_t* data, int n, int cols) { // i2 format: 4 values per byte (2 bits each) int total_bytes = n * cols / 4; for (int i = 0; i < total_bytes; i++) { data[i] = rand() & 0xFF; } } // Initialize matrix with random i8 values void init_matrix_i8(int8_t* data, int n, int rows) { int total_elements = n * rows; for (int i = 0; i < total_elements; i++) { data[i] = (int8_t)((rand() % 256) - 128); } } // Benchmark configuration struct BenchmarkConfig { int n; // embedding dimension (must be divisible by 4) int nr; // number of rows in Y matrix int nc; // number of columns in X matrix int iterations; // number of benchmark iterations int warmup; // number of warmup iterations }; void print_config(const BenchmarkConfig& config) { printf("=" "=%.78s\n", "==============================================================================="); printf("Benchmark Configuration:\n"); printf("=" "=%.78s\n", "==============================================================================="); printf(" Embedding dimension (n) : %d\n", config.n); printf(" Matrix Y rows (nr) : %d\n", config.nr); printf(" Matrix X columns (nc) : %d\n", config.nc); printf(" Iterations : %d\n", config.iterations); printf(" Warmup iterations : %d\n", config.warmup); printf("\nMatrix sizes:\n"); printf(" X (i2): %d x %d (%.2f KB)\n", config.nc, config.n, (config.nc * config.n / 4) / 1024.0); printf(" Y (i8): %d x %d (%.2f KB)\n", config.nr, config.n, (config.nr * config.n) / 1024.0); printf(" S (f32): %d x %d (%.2f KB)\n", config.nr, config.nc, (config.nr * config.nc * sizeof(float)) / 1024.0); printf("\nGEMM Config:\n"); #if defined(ACT_PARALLEL) printf(" ACT_PARALLEL : ON\n"); #else printf(" ACT_PARALLEL : OFF\n"); #endif printf(" ROW_BLOCK_SIZE : %d\n", ROW_BLOCK_SIZE); printf(" COL_BLOCK_SIZE : %d\n", COL_BLOCK_SIZE); printf(" PARALLEL_SIZE : %d\n", PARALLEL_SIZE); printf("=" "=%.78s\n\n", "==============================================================================="); } void run_benchmark(const BenchmarkConfig& config) { // Allocate matrices printf("Allocating matrices...\n"); // X matrix (i2 format): nc x n, but stored as nc x (n/4) bytes uint8_t* X = (uint8_t*)malloc(config.nc * config.n / 4); // Y matrix (i8 format): nr x n int8_t* Y = (int8_t*)malloc(config.nr * config.n); // Result matrix (float32): nr x nc float* S = (float*)malloc(config.nr * config.nc * sizeof(float)); if (!X || !Y || !S) { fprintf(stderr, "Failed to allocate memory\n"); exit(1); } // Initialize matrices with random data printf("Initializing matrices with random data...\n"); srand(time(NULL)); init_matrix_i2(X, config.n, config.nc); init_matrix_i8(Y, config.n, config.nr); memset(S, 0, config.nr * config.nc * sizeof(float)); // Warmup printf("Running %d warmup iterations...\n", config.warmup); for (int i = 0; i < config.warmup; i++) { ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc); } // Benchmark printf("Running %d benchmark iterations...\n", config.iterations); double total_time = 0.0; double min_time = 1e20; double max_time = 0.0; for (int i = 0; i < config.iterations; i++) { double start = get_time_ns(); ggml_gemm_i2_i8_s(config.n, S, config.nc, X, Y, config.nr, config.nc); double end = get_time_ns(); double elapsed = end - start; total_time += elapsed; if (elapsed < min_time) min_time = elapsed; if (elapsed > max_time) max_time = elapsed; if ((i + 1) % 100 == 0) { printf(" Progress: %d/%d iterations\n", i + 1, config.iterations); } } // Calculate statistics double avg_time_ns = total_time / config.iterations; double avg_time_ms = avg_time_ns / 1e6; double min_time_ms = min_time / 1e6; double max_time_ms = max_time / 1e6; // Calculate GFLOPS // For GEMM: nr x nc x n multiply-adds = 2 * nr * nc * n FLOPs double flops = 2.0 * config.nr * config.nc * config.n; double gflops = (flops / avg_time_ns); // Calculate throughput (tokens/s assuming each column is a token) double throughput = (config.nc * 1e9) / avg_time_ns; // Print results printf("\n"); printf("=" "=%.78s\n", "==============================================================================="); printf("Benchmark Results:\n"); printf("=" "=%.78s\n", "==============================================================================="); printf(" Average time : %.3f ms\n", avg_time_ms); printf(" Min time : %.3f ms\n", min_time_ms); printf(" Max time : %.3f ms\n", max_time_ms); printf(" Std dev : %.3f ms\n", sqrt((max_time_ms - min_time_ms) * (max_time_ms - min_time_ms) / 12)); printf("\nPerformance:\n"); printf(" GFLOPS : %.2f\n", gflops); printf(" Throughput : %.2f tokens/s\n", throughput); printf(" Latency/token : %.3f us\n", (avg_time_ms * 1000) / config.nc); printf("=" "=%.78s\n", "==============================================================================="); // Cleanup free(X); free(Y); free(S); } void print_usage(const char* program) { printf("Usage: %s [options]\n", program); printf("Options:\n"); printf(" -n Embedding dimension (must be divisible by 4, default: 2048)\n"); printf(" -r Number of rows in matrix Y (default: 32)\n"); printf(" -c Number of columns in matrix X (default: 128)\n"); printf(" -i Number of iterations (default: 1000)\n"); printf(" -w Number of warmup iterations (default: 10)\n"); printf(" -h Show this help message\n"); } int main(int argc, char** argv) { BenchmarkConfig config = { .n = 2048, .nr = 32, .nc = 128, .iterations = 1000, .warmup = 10 }; // Parse command line arguments for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "-n") == 0 && i + 1 < argc) { config.n = atoi(argv[++i]); } else if (strcmp(argv[i], "-r") == 0 && i + 1 < argc) { config.nr = atoi(argv[++i]); } else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { config.nc = atoi(argv[++i]); } else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) { config.iterations = atoi(argv[++i]); } else if (strcmp(argv[i], "-w") == 0 && i + 1 < argc) { config.warmup = atoi(argv[++i]); } else if (strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { fprintf(stderr, "Unknown option: %s\n", argv[i]); print_usage(argv[0]); return 1; } } // Validate configuration if (config.n % 4 != 0) { fprintf(stderr, "Error: Embedding dimension (-n) must be divisible by 4\n"); return 1; } if (config.n <= 0 || config.nr <= 0 || config.nc <= 0 || config.iterations <= 0) { fprintf(stderr, "Error: All size parameters must be positive\n"); return 1; } // Run benchmark print_config(config); run_benchmark(config); return 0; }