diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 957b59d..5095a95 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 957b59d2207370cd5061dd1bb12d079aa267fbab
+Subproject commit 5095a956646d2143362eecbc78800fe4f6e70007
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 4d94081..f0d7d36 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -5,6 +5,7 @@ from configparser import ConfigParser
 def gen_ctor_code():
     kernel_code = "\n\
 #include \"ggml-bitnet.h\"\n\
+#include \"ggml-cpu-impl.h\"\n\
 #include \n\
 #include \n\
 #define GGML_BITNET_MAX_NODES 8192\n\
@@ -105,7 +106,7 @@ inline int32_t partial_max_reset(int32_t bs, void* lut_scales_) {\n\
 template\n\
 inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256i vec_lut[16];\n\
+    __m256 vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\
@@ -191,7 +192,7 @@ inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_t
 template\n\
 inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256i vec_lut[16];\n\
+    __m256 vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\
@@ -623,7 +624,7 @@ def gen_top_api(kernel_shapes, k_list):
     kernel_code = "".join([kernel_code, "}\n"])
     return kernel_code
 
-def gen_transform_code(kernel_shapes):
+def gen_transform_code(kernel_shapes, fp16):
     kernel_code = "\n\
 void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) {\n\
     if (!(is_type_supported(tensor->type) && tensor->backend == GGML_BACKEND_TYPE_CPU && tensor->extra == nullptr)) {\n\
@@ -657,10 +658,20 @@ void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) {\n\
     scales = (bitnet_float_type *) aligned_malloc(sizeof(bitnet_float_type));\n\
     qweights = (uint8_t *) tensor->data;\n\
     int nbytes = (k - 256) * m / 3 * 5 / 8 + 256 * m / 2 * 4 / 8;\n\
-    if (nbytes % 32 != 0) nbytes = 32 - nbytes % 32 + nbytes;\n\
+    nbytes = 32 - nbytes % 32 + nbytes;\n\
     float * i2_scales = (float * )(qweights + nbytes);\n\
-    scales[0] = (bitnet_float_type) i2_scales[0];\n\
-\n\
+\n"])
+
+    if fp16:
+        kernel_code = "".join([kernel_code, "\
+    ggml_fp16_t* fp16_scale = (ggml_fp16_t *)aligned_malloc(sizeof(ggml_fp16_t));\n\
+    fp16_scale[0] = GGML_FP32_TO_FP16(i2_scales[0]);\n\
+    scales[0] = (bitnet_float_type) GGML_FP16_TO_FP32(fp16_scale[0]);\n"])
+    else:
+        kernel_code = "".join([kernel_code, "\
+    scales[0] = (bitnet_float_type) i2_scales[0];\n"])
+
+    kernel_code = "".join([kernel_code, "\n\
     tensor->extra = bitnet_tensor_extras + bitnet_tensor_extras_index;\n\
     bitnet_tensor_extras[bitnet_tensor_extras_index++] = {\n\
         /* .lut_scales_size = */ lut_scales_size,\n\
@@ -702,6 +713,7 @@ if __name__ == "__main__":
                         help="block length when cutting one weight (M, K) into K / BK weights (M, BK).")
     parser.add_argument('--bm',default="input", type=str,
                         help="using simd instructions to compute (bm, 192 / bm) in one block")
+    parser.add_argument('--fp16', action="store_true", help="convert scale to fp16")
 
     args = parser.parse_args()
     kernel_shapes = ModelShapeDict[args.model]
@@ -730,7 +742,7 @@ if __name__ == "__main__":
 
     ctor_code = gen_ctor_code()
     api_code = gen_top_api(kernel_shapes, k_list)
-    trans_code = gen_transform_code(kernel_shapes)
+    trans_code = gen_transform_code(kernel_shapes, args.fp16)
 
     output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "include")
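
Two functional changes ride along with the submodule bump: `vec_lut` is corrected from `__m256i` to `__m256`, apparently because the LUT entries are built from float-vector (`*_ps`) intrinsics, and the new `--fp16` flag makes the generated transform code round-trip the per-tensor scale through half precision, so the scale the kernels consume carries only fp16 precision. Below is a minimal sketch of that round-trip, using ggml's public `ggml_fp32_to_fp16` / `ggml_fp16_to_fp32` helpers from `ggml.h` as stand-ins for the internal `GGML_FP32_TO_FP16` / `GGML_FP16_TO_FP32` macros that the generated code pulls in via `ggml-cpu-impl.h`; the standalone `main()` and the example value are illustrative only.

```c
// Sketch of the --fp16 scale round-trip: narrow the stored fp32 scale to
// ggml_fp16_t, then widen it back before use. Assumes ggml's public fp16
// conversion helpers; the value below is a made-up example scale.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    float i2_scale = 0.00712345f;                  // example fp32 weight scale
    ggml_fp16_t h  = ggml_fp32_to_fp16(i2_scale);  // narrow to half precision
    float scale    = ggml_fp16_to_fp32(h);         // widen back for compute
    printf("fp32 %.8f -> fp16 -> fp32 %.8f\n", i2_scale, scale);
    return 0;
}
```

The conversion is lossy: after the round-trip the runtime scale holds only the roughly three decimal digits an fp16 value can represent, which is presumably the point of the flag, to match what an fp16 model file would store.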