Mirror of https://github.com/microsoft/BitNet.git, synced 2026-05-03 11:20:36 +00:00
Merge pull request #280 from microsoft/fix-convert-dev
Enable conversion from .safetensors checkpoints to gguf files
@@ -292,6 +292,17 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile

# Run the benchmark with the generated model: use -m to specify the model path, -p to specify the number of prompt tokens to process, and -n to specify the number of tokens to generate
python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
```

### Convert from `.safetensors` Checkpoints

```sh
# Prepare the .safetensors model file
huggingface-cli download microsoft/bitnet-b1.58-2B-4T-bf16 --local-dir ./models/bitnet-b1.58-2B-4T-bf16

# Convert to gguf model
python ./utils/convert-helper-bitnet.py ./models/bitnet-b1.58-2B-4T-bf16
```
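Once the helper finishes, a quick sanity check is to read the header of the produced file back with the `gguf` Python package that ships with llama.cpp. A minimal sketch (assumes `pip install gguf`; the output path follows the helper's naming):

```python
# Sketch: read back the converted GGUF header to confirm the file is valid.
from gguf import GGUFReader

reader = GGUFReader("./models/bitnet-b1.58-2B-4T-bf16/ggml-model-i2s-bitnet.gguf")

# List a few metadata keys and tensor entries.
for key in list(reader.fields)[:8]:
    print(key)
for tensor in reader.tensors[:5]:
    print(tensor.name, tensor.tensor_type, tensor.shape)
```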

### FAQ (Frequently Asked Questions)📌

#### Q1: The build dies with errors building llama.cpp due to issues with std::chrono in log.cpp?
utils/convert-helper-bitnet.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

import sys
import os
import shutil
import subprocess
from pathlib import Path


def run_command(command_list, cwd=None, check=True):
    print(f"Executing: {' '.join(map(str, command_list))}")
    try:
        process = subprocess.run(command_list, cwd=cwd, check=check, capture_output=False, text=True)
        return process
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {' '.join(map(str, e.cmd))}")
        print(f"Return code: {e.returncode}")
        raise


def main():
    if len(sys.argv) < 2:
        script_name = Path(sys.argv[0]).name
        print(f"Usage: python {script_name} <model-directory>")
        sys.exit(1)

    model_dir_arg = sys.argv[1]
    model_dir = Path(model_dir_arg).resolve()

    if not model_dir.is_dir():
        print(f"Error: Model directory '{model_dir}' not found or is not a directory.")
        sys.exit(1)

    utils_dir = Path(__file__).parent.resolve()
    project_root_dir = utils_dir.parent

    preprocess_script = utils_dir / "preprocess-huggingface-bitnet.py"
    convert_script = utils_dir / "convert-ms-to-gguf-bitnet.py"

    llama_quantize_binary = project_root_dir / "build" / "bin" / "llama-quantize"

    input_file = model_dir / "model.safetensors"
    input_backup_file = model_dir / "model.safetensors.backup"
    # The preprocessed tensors are written back under the original name,
    # which is the path the GGUF converter expects to find.
    preprocessed_output_file = model_dir / "model.safetensors"

    gguf_f32_output = model_dir / "ggml-model-f32-bitnet.gguf"
    gguf_i2s_output = model_dir / "ggml-model-i2s-bitnet.gguf"

    if not preprocess_script.is_file():
        print(f"Error: Preprocess script not found at '{preprocess_script}'")
        sys.exit(1)
    if not convert_script.is_file():
        print(f"Error: Convert script not found at '{convert_script}'")
        sys.exit(1)
    if not llama_quantize_binary.is_file():
        print(f"Error: llama-quantize binary not found at '{llama_quantize_binary}'")
        sys.exit(1)

    if not input_file.is_file():
        print(f"Error: Input safetensors file not found at '{input_file}'")
        sys.exit(1)

    try:
        # Step 0: move the original checkpoint aside so the preprocessed
        # tensors can take its place.
        print(f"Backing up '{input_file}' to '{input_backup_file}'")
        if input_backup_file.exists():
            print(f"Warning: Removing existing backup file '{input_backup_file}'")
            input_backup_file.unlink()
        shutil.move(input_file, input_backup_file)

        # Step 1: ternarize the projection weights in the checkpoint.
        print("Preprocessing huggingface checkpoint...")
        cmd_preprocess = [
            sys.executable,
            str(preprocess_script),
            "--input", str(input_backup_file),
            "--output", str(preprocessed_output_file)
        ]
        run_command(cmd_preprocess)

        # Step 2: convert the preprocessed checkpoint to a full-precision GGUF.
        print("Converting to GGUF (f32)...")
        cmd_convert = [
            sys.executable,
            str(convert_script),
            str(model_dir),
            "--vocab-type", "bpe",
            "--outtype", "f32",
            "--concurrency", "1",
            "--outfile", str(gguf_f32_output)
        ]
        run_command(cmd_convert)

        # Step 3: quantize the f32 GGUF down to the ternary I2_S format.
        print("Quantizing model to I2_S...")
        cmd_quantize = [
            str(llama_quantize_binary),
            str(gguf_f32_output),
            str(gguf_i2s_output),
            "I2_S",
            "1"
        ]
        run_command(cmd_quantize)

        print("Converted successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always clean up intermediates and restore the original checkpoint.
        print("Cleaning up intermediate files...")
        if preprocessed_output_file.exists() and preprocessed_output_file != input_backup_file:
            print(f"Removing preprocessed file: {preprocessed_output_file}")
            try:
                preprocessed_output_file.unlink()
            except OSError as e:
                print(f"Warning: Could not remove {preprocessed_output_file}: {e}")

        if gguf_f32_output.exists():
            print(f"Removing f32 GGUF: {gguf_f32_output}")
            try:
                gguf_f32_output.unlink()
            except OSError as e:
                print(f"Warning: Could not remove {gguf_f32_output}: {e}")

        if input_backup_file.exists():
            if not input_file.exists():
                print(f"Restoring original '{input_file}' from '{input_backup_file}'")
                try:
                    shutil.move(input_backup_file, input_file)
                except Exception as e:
                    print(f"Warning: Could not restore {input_file} from backup: {e}")
            else:
                print(f"Removing backup '{input_backup_file}' as the original '{input_file}' should be present.")
                try:
                    input_backup_file.unlink()
                except OSError as e:
                    print(f"Warning: Could not remove backup {input_backup_file}: {e}")


if __name__ == "__main__":
    main()
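The helper is self-contained, so it can also be driven programmatically when several checkpoints need converting. A minimal sketch, assuming the directory layout from the README example and a working directory at the repo root:

```python
# Sketch: batch-convert checkpoint directories by invoking the helper per model.
import subprocess
import sys
from pathlib import Path

model_dirs = [Path("./models/bitnet-b1.58-2B-4T-bf16")]  # assumed layout

for model_dir in model_dirs:
    subprocess.run(
        [sys.executable, "utils/convert-helper-bitnet.py", str(model_dir)],
        check=True,  # surface a non-zero exit as CalledProcessError
    )
```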
utils/convert-ms-to-gguf-bitnet.py
@@ -1417,6 +1417,9 @@ class OutputFile:

        of = OutputFile(fname_out, endianess=endianess)

        if 'bitnet' in of.gguf.arch:
            svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"

        # meta data
        of.add_meta_arch(params)
        if isinstance(vocab, Vocab):
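To preview the prompt format this template encodes, it can be rendered outside the runtime with jinja2. A sketch with placeholder special tokens (the model's real bos/eos strings come from its tokenizer, not from this snippet):

```python
# Sketch: render the BitNet chat template standalone to inspect the prompt format.
from jinja2 import Template

template_str = (
    "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}"
    "{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
]
# Placeholder tokens, for illustration only.
print(Template(template_str).render(messages=messages, bos_token="<s>", eos_token="</s>"))
```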
@@ -0,0 +1,50 @@
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import save_file
|
||||
import torch
|
||||
|
||||
def quant_weight_fp16(weight):
|
||||
weight = weight.to(torch.float)
|
||||
s = 1.0 / weight.abs().mean().clamp_(min=1e-5)
|
||||
new_weight = (weight * s).round().clamp(-1, 1) / s
|
||||
return new_weight
|
||||
|
||||
def quant_model(input, output):
|
||||
tensors = {}
|
||||
|
||||
with safe_open(input, framework='pt') as f:
|
||||
for name in f.keys():
|
||||
tensors[name] = f.get_tensor(name)
|
||||
|
||||
keyword_list = [
|
||||
'q_proj.weight',
|
||||
'k_proj.weight',
|
||||
'v_proj.weight',
|
||||
'o_proj.weight',
|
||||
'gate_proj.weight',
|
||||
'up_proj.weight',
|
||||
'down_proj.weight'
|
||||
]
|
||||
|
||||
if any(keyword in name for keyword in keyword_list):
|
||||
print(f'[INFO] Quantizing {name}')
|
||||
tensors[name] = quant_weight_fp16(tensors[name])
|
||||
|
||||
print(f'[INFO] Saving to {output}\nThis may take a while.')
|
||||
save_file(tensors, output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Convert Safetensors back to Torch .pth checkpoint")
|
||||
parser.add_argument(
|
||||
"--input", type=str, required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=str, required=True,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
quant_model(
|
||||
input=args.input,
|
||||
output=args.output,
|
||||
)
|
||||
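The `quant_weight_fp16` round-trip is the absmean ternarization from the BitNet b1.58 recipe: scale by the inverse of the mean absolute weight, round and clamp to {-1, 0, +1}, then rescale. A tiny worked example (toy values, for illustration only):

```python
# Sketch: absmean ternarization on a toy tensor, mirroring quant_weight_fp16.
import torch

w = torch.tensor([[0.40, -0.05], [-0.90, 0.20]])
s = 1.0 / w.abs().mean().clamp_(min=1e-5)   # 1 / 0.3875 ≈ 2.5806
q = (w * s).round().clamp(-1, 1)            # [[1., -0.], [-1., 1.]]
print(q / s)                                # dequantized: entries are 0 or ±0.3875
```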