diff --git a/README.md b/README.md
index c69a3f1..798c0e9 100644
--- a/README.md
+++ b/README.md
@@ -292,6 +292,17 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile
 # Run benchmark with the generated model, use -m to specify the model path, -p to specify the prompt processed, -n to specify the number of token to generate
 python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
 ```
+
+### Convert from `.safetensors` Checkpoints
+
+```sh
+# Prepare the .safetensors model file
+huggingface-cli download microsoft/bitnet-b1.58-2B-4T-bf16 --local-dir ./models/bitnet-b1.58-2B-4T-bf16
+
+# Convert to gguf model
+python ./utils/convert-helper-bitnet.py ./models/bitnet-b1.58-2B-4T-bf16
+```
+
 ### FAQ (Frequently Asked Questions)📌
 
 #### Q1: The build dies with errors building llama.cpp due to issues with std::chrono in log.cpp?
diff --git a/utils/convert-helper-bitnet.py b/utils/convert-helper-bitnet.py
new file mode 100644
index 0000000..5b4149a
--- /dev/null
+++ b/utils/convert-helper-bitnet.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+def run_command(command_list, cwd=None, check=True):
+    print(f"Executing: {' '.join(map(str, command_list))}")
+    try:
+        process = subprocess.run(command_list, cwd=cwd, check=check, capture_output=False, text=True)
+        return process
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing command: {' '.join(map(str, e.cmd))}")
+        print(f"Return code: {e.returncode}")
+        raise
+
+def main():
+    if len(sys.argv) < 2:
+        script_name = Path(sys.argv[0]).name
+        print(f"Usage: python {script_name} <model_dir>")
+        sys.exit(1)
+
+    model_dir_arg = sys.argv[1]
+    model_dir = Path(model_dir_arg).resolve()
+
+    if not model_dir.is_dir():
+        print(f"Error: Model directory '{model_dir}' not found or is not a directory.")
+        sys.exit(1)
+
+    utils_dir = Path(__file__).parent.resolve()
+    project_root_dir = utils_dir.parent
+
+    preprocess_script = utils_dir / "preprocess-huggingface-bitnet.py"
+    convert_script = utils_dir / "convert-ms-to-gguf-bitnet.py"
+
+    llama_quantize_binary = project_root_dir / "build" / "bin" / "llama-quantize"
+
+    input_file = model_dir / "model.safetensors"
+    input_backup_file = model_dir / "model.safetensors.backup"
+    preprocessed_output_file = model_dir / "model.safetensors"
+
+    gguf_f32_output = model_dir / "ggml-model-f32-bitnet.gguf"
+    gguf_i2s_output = model_dir / "ggml-model-i2s-bitnet.gguf"
+
+    if not preprocess_script.is_file():
+        print(f"Error: Preprocess script not found at '{preprocess_script}'")
+        sys.exit(1)
+    if not convert_script.is_file():
+        print(f"Error: Convert script not found at '{convert_script}'")
+        sys.exit(1)
+    if not llama_quantize_binary.is_file():
+        print(f"Error: llama-quantize binary not found at '{llama_quantize_binary}'")
+        sys.exit(1)
+
+    if not input_file.is_file():
+        print(f"Error: Input safetensors file not found at '{input_file}'")
+        sys.exit(1)
+
+    try:
+        print(f"Backing up '{input_file}' to '{input_backup_file}'")
+        if input_backup_file.exists():
+            print(f"Warning: Removing existing backup file '{input_backup_file}'")
+            input_backup_file.unlink()
+        shutil.move(input_file, input_backup_file)
+
+        print("Preprocessing huggingface checkpoint...")
+        cmd_preprocess = [
+            sys.executable,
+            str(preprocess_script),
+            "--input", str(input_backup_file),
+            "--output", str(preprocessed_output_file)
+        ]
+        run_command(cmd_preprocess)
+
+        print("Converting to GGUF (f32)...")
+        cmd_convert = [
+            sys.executable,
+            str(convert_script),
+            str(model_dir),
+            "--vocab-type", "bpe",
+            "--outtype", "f32",
+            "--concurrency", "1",
+            "--outfile", str(gguf_f32_output)
+        ]
+        run_command(cmd_convert)
+
+        print("Quantizing model to I2_S...")
+        cmd_quantize = [
+            str(llama_quantize_binary),
+            str(gguf_f32_output),
+            str(gguf_i2s_output),
+            "I2_S",
+            "1"
+        ]
+        run_command(cmd_quantize)
+
+        print("Converted successfully.")
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+    finally:
+        print("Cleaning up intermediate files...")
+        if preprocessed_output_file.exists() and preprocessed_output_file != input_backup_file:
+            print(f"Removing preprocessed file: {preprocessed_output_file}")
+            try:
+                preprocessed_output_file.unlink()
+            except OSError as e:
+                print(f"Warning: Could not remove {preprocessed_output_file}: {e}")
+
+        if gguf_f32_output.exists():
+            print(f"Removing f32 GGUF: {gguf_f32_output}")
+            try:
+                gguf_f32_output.unlink()
+            except OSError as e:
+                print(f"Warning: Could not remove {gguf_f32_output}: {e}")
+
+        if input_backup_file.exists():
+            if not input_file.exists():
+                print(f"Restoring original '{input_file}' from '{input_backup_file}'")
+                try:
+                    shutil.move(input_backup_file, input_file)
+                except Exception as e:
+                    print(f"Warning: Could not restore {input_file} from backup: {e}")
+            else:
+                print(f"Removing backup '{input_backup_file}' as original '{input_file}' should be present.")
+                try:
+                    input_backup_file.unlink()
+                except OSError as e:
+                    print(f"Warning: Could not remove backup {input_backup_file}: {e}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
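The helper converts the checkpoint in place, so its error handling is the interesting part: `model.safetensors` is moved to a `.backup`, the preprocessed tensors are written under the original name, and the `finally` block removes intermediates and puts the original file back whether or not conversion succeeded, leaving only `ggml-model-i2s-bitnet.gguf` behind. A condensed sketch of that cleanup contract (`convert_with_cleanup` and `steps` are illustrative names, not part of the patch):

```python
import shutil
from pathlib import Path

def convert_with_cleanup(checkpoint: Path, steps) -> None:
    """Run `steps` against a backed-up checkpoint; always restore the original."""
    backup = checkpoint.with_name(checkpoint.name + ".backup")
    shutil.move(checkpoint, backup)      # set the original aside
    try:
        steps(backup, checkpoint)        # may write intermediates under the old name
    finally:
        if checkpoint.exists():
            checkpoint.unlink()          # drop the preprocessed intermediate
        shutil.move(backup, checkpoint)  # the original always comes back
```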
diff --git a/utils/convert-ms-to-gguf-bitnet.py b/utils/convert-ms-to-gguf-bitnet.py
index 23a1a2c..e9e9162 100644
--- a/utils/convert-ms-to-gguf-bitnet.py
+++ b/utils/convert-ms-to-gguf-bitnet.py
@@ -1417,6 +1417,9 @@ class OutputFile:
 
         of = OutputFile(fname_out, endianess=endianess)
 
+        if 'bitnet' in of.gguf.arch:
+            svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
+
         # meta data
         of.add_meta_arch(params)
         if isinstance(vocab, Vocab):
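To see what this template produces, here is a quick render with the `jinja2` package (the `<BOS>`/`<EOS>` strings below are placeholders for illustration; the real special tokens come from the model's tokenizer):

```python
from jinja2 import Template

# Template string copied from the hunk above; Jinja interprets the \n
# escapes inside its string literals as real newlines when rendering.
chat_template = (
    "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}"
    "{% if message['role'] == 'user' %}"
    "{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ message['content'] + eos_token }}"
    "{% endif %}{% endfor %}"
)

messages = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi! How can I help?"},
]

print(Template(chat_template).render(messages=messages, bos_token="<BOS>", eos_token="<EOS>"))
# <BOS>Human: Hello
#
# BITNETAssistant: <EOS>Hi! How can I help?<EOS>
```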
diff --git a/utils/preprocess-huggingface-bitnet.py b/utils/preprocess-huggingface-bitnet.py
new file mode 100644
index 0000000..af75cd6
--- /dev/null
+++ b/utils/preprocess-huggingface-bitnet.py
@@ -0,0 +1,50 @@
+from safetensors import safe_open
+from safetensors.torch import save_file
+import torch
+
+def quant_weight_fp16(weight):
+    weight = weight.to(torch.float)
+    s = 1.0 / weight.abs().mean().clamp_(min=1e-5)
+    new_weight = (weight * s).round().clamp(-1, 1) / s
+    return new_weight
+
+def quant_model(input, output):
+    tensors = {}
+
+    keyword_list = [
+        'q_proj.weight',
+        'k_proj.weight',
+        'v_proj.weight',
+        'o_proj.weight',
+        'gate_proj.weight',
+        'up_proj.weight',
+        'down_proj.weight'
+    ]
+
+    with safe_open(input, framework='pt') as f:
+        for name in f.keys():
+            tensors[name] = f.get_tensor(name)
+
+            if any(keyword in name for keyword in keyword_list):
+                print(f'[INFO] Quantizing {name}')
+                tensors[name] = quant_weight_fp16(tensors[name])
+
+    print(f'[INFO] Saving to {output}\nThis may take a while.')
+    save_file(tensors, output)
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Quantize a BitNet huggingface .safetensors checkpoint to ternary weights")
+    parser.add_argument(
+        "--input", type=str, required=True,
+    )
+    parser.add_argument(
+        "--output", type=str, required=True,
+    )
+    args = parser.parse_args()
+
+    quant_model(
+        input=args.input,
+        output=args.output,
+    )
\ No newline at end of file
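`quant_weight_fp16` is the absmean ternary ("1.58-bit") fake quantization from BitNet b1.58: scale the weight matrix by the inverse of its mean absolute value, round and clamp to {-1, 0, +1}, then rescale. A small self-contained check of that behavior (assumes PyTorch; the input tensor is arbitrary):

```python
import torch

def quant_weight_fp16(weight):
    # Same logic as in utils/preprocess-huggingface-bitnet.py above.
    weight = weight.to(torch.float)
    s = 1.0 / weight.abs().mean().clamp_(min=1e-5)
    return (weight * s).round().clamp(-1, 1) / s

torch.manual_seed(0)
w = torch.randn(4, 4)
wq = quant_weight_fp16(w)

# The fake-quantized tensor takes at most three distinct values: -1/s, 0, +1/s.
print(torch.unique(wq))
# Rescaling by s recovers the ternary codes {-1, 0, +1} (up to float rounding).
print(torch.unique((wq / w.abs().mean()).round()))
```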