mirror of https://github.com/microsoft/BitNet.git (synced 2026-05-04 03:40:50 +00:00)
Merge pull request #137 from younesbelkada/f3-changes
Feat: Add changes for Falcon3 release
Vendored +1 -1
Submodule 3rdparty/llama.cpp updated: 814d0ee544...2ce8604036
@@ -130,21 +130,21 @@ pip install -r requirements.txt
 3. Build the project
 ```bash
 # Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s
+python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s

 # Or you can manually download the model and run with local path
-huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
-python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
+huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
+python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
 ```
 <pre>
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
                     [--use-pretuned]

 Setup the environment for running inference

 optional arguments:
   -h, --help            show this help message and exit
-  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
+  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}
                         Model used for inference
   --model-dir MODEL_DIR, -md MODEL_DIR
                         Directory to save/load the model

@@ -159,7 +159,7 @@ optional arguments:
 ### Basic usage
 ```bash
 # Run inference with the quantized model
-python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0
+python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -cnv "You are a helpful assistant"

 # Output:
 # Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
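One detail worth noting on the usage hunk: `run_inference.py` (next file) defines `-cnv` with `action='store_true'`, i.e. a bare flag, so the system prompt presumably still needs to travel via `-p`. The sketch below replays the README's two steps from Python under that assumption; the `subprocess` wrapper and prompt wording are illustrative, not part of the PR:

```python
import subprocess

# Illustrative replay of the README's Falcon3 steps; not part of the PR.
# Passing the system prompt via -p is an assumption, since run_inference.py
# defines -cnv as a boolean flag (action='store_true').
repo = "tiiuae/Falcon3-7B-Instruct-1.58bit"

# 1. Download the model, convert it to quantized gguf format, and build.
subprocess.run(["python", "setup_env.py", "--hf-repo", repo, "-q", "i2_s"],
               check=True)

# 2. Run inference in chat mode with the quantized model.
subprocess.run([
    "python", "run_inference.py",
    "-m", "models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf",
    "-p", "You are a helpful assistant",
    "-cnv",
], check=True)
```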
+3 -1
@@ -30,7 +30,8 @@ def run_inference():
         '-ngl', '0',
         '-c', str(args.ctx_size),
         '--temp', str(args.temperature),
-        "-b", "1"
+        "-b", "1",
+        "-cnv" if args.conversation else ""
     ]
     run_command(command)

@@ -48,6 +49,7 @@ if __name__ == "__main__":
     parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
     parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
     parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
+    parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")

     args = parser.parse_args()
     run_inference()
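The conditional element keeps the change to a single line, but when chat mode is off it appends a literal empty string to the command list, which the downstream binary then receives as a blank argument. A minimal sketch of an equivalent pattern that avoids this (names mirror the argparse namespace above; not part of the PR):

```python
# Minimal sketch, assuming the same argparse namespace as run_inference.py.
# Appending the flag only when requested keeps "" out of the argv entirely.
def build_command(args) -> list:
    command = [
        "-ngl", "0",
        "-c", str(args.ctx_size),
        "--temp", str(args.temperature),
        "-b", "1",
    ]
    if args.conversation:
        command.append("-cnv")  # chat mode for instruct models
    return command
```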
+24 -3
@@ -19,7 +19,28 @@ SUPPORTED_HF_MODELS = {
     },
     "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
         "model_name": "Llama3-8B-1.58-100B-tokens",
-    }
+    },
+    "tiiuae/Falcon3-7B-Instruct-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-7B-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-Instruct-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-Instruct-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-1B-Instruct-1.58bit": {
+        "model_name": "Falcon3-1B-1.58bit",
+    },
 }

 SUPPORTED_QUANT_TYPES = {
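Note that each Instruct repo and its base counterpart share a single `model_name`, so both resolve to the same local directory and, further down, the same kernel shapes (only the 1B base repo has no entry here). A hypothetical lookup illustrating how a repo ID presumably resolves through this table (the helper name is invented; the table is excerpted to two entries):

```python
# Hypothetical helper around the SUPPORTED_HF_MODELS table from the hunk above.
SUPPORTED_HF_MODELS = {
    "tiiuae/Falcon3-7B-Instruct-1.58bit": {"model_name": "Falcon3-7B-1.58bit"},
    "tiiuae/Falcon3-7B-1.58bit": {"model_name": "Falcon3-7B-1.58bit"},
}

def resolve_model_name(hf_repo: str) -> str:
    try:
        return SUPPORTED_HF_MODELS[hf_repo]["model_name"]
    except KeyError:
        raise ValueError(f"unsupported HF repo: {hf_repo}") from None

# Both repos resolve to the same local directory, models/Falcon3-7B-1.58bit,
# and therefore to the same pretuned kernel configuration.
```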
@@ -133,7 +154,7 @@ def gen_code():
             shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
     if get_model_name() == "bitnet_b1_58-large":
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
-    elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
     elif get_model_name() == "bitnet_b1_58-3B":
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")

@@ -149,7 +170,7 @@ def gen_code():
             shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
     if get_model_name() == "bitnet_b1_58-large":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
-    elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
     elif get_model_name() == "bitnet_b1_58-3B":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
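Both hunks make the same move: the string-equality check becomes a membership test, so every Falcon3 size reuses the pretuned Llama3-8B tile shapes (`--BM`/`--BK`/`--bm`) rather than getting its own codegen entry. A sketch of the resulting dispatch, with the shared family pulled out as a constant (the set and the constant name are additions here, not in the PR; the tile sizes are the tl1 values from the hunk above):

```python
# Sketch only: the family constant and set-based lookup are not in the PR.
LLAMA3_KERNEL_FAMILY = {
    "Llama3-8B-1.58-100B-tokens",
    "Falcon3-1B-1.58bit",
    "Falcon3-3B-1.58bit",
    "Falcon3-7B-1.58bit",
    "Falcon3-10B-1.58bit",
}

def tl1_codegen_args(model_name: str) -> list:
    if model_name in LLAMA3_KERNEL_FAMILY:
        # Every family member is generated with the Llama3-8B tl1 shapes.
        return ["--model", "Llama3-8B-1.58-100B-tokens",
                "--BM", "256,128,256,128",
                "--BK", "128,64,128,64",
                "--bm", "32,64,32,64"]
    raise ValueError(f"no tl1 kernel shapes known for {model_name}")
```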
@@ -272,7 +272,10 @@ class Model(ABC):
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
+                # We need to manually encode and decode the added tokens in case special characters
+                # used for `\n` / `\t` have been manually added in the added tokens
+                encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+                tokens.append(encoded_decoded_token)
                 if tokenizer.added_tokens_decoder[i].special:
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
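The new encode/decode round-trip exists because added tokens can be registered with escaped whitespace (literal `\n` / `\t` sequences): one pass through the tokenizer normalizes the stored string to whatever the tokenizer actually emits, so the gguf vocab matches runtime output. A small self-contained illustration of the mechanism (the checkpoint is an assumption, and `add_special_tokens=False` is added here to keep BOS/EOS out of the round-trip, whereas the diff relies on the tokenizer's defaults):

```python
from transformers import AutoTokenizer

# Assumed checkpoint for illustration; the mechanism is tokenizer-agnostic.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct-1.58bit")

def normalize_added_token(token: str) -> str:
    # Mirror of the hunk above: encode then decode once, and store the
    # decoded form, i.e. exactly what the tokenizer produces at runtime.
    ids = tokenizer.encode(token, add_special_tokens=False)
    return tokenizer.decode(ids)
```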
@@ -280,7 +283,6 @@ class Model(ABC):
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

         return tokens, toktypes, tokpre

     # NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -335,6 +337,8 @@ class Model(ABC):
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            res = "falcon3"

         if res is None:
             logger.warning("\n")
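For readers outside this file: `chkhsh` is a fingerprint of tokenizer behavior. Per the `# NOTE` above, this function is generated by convert-hf-to-gguf-update.py, which in upstream llama.cpp hashes the token IDs the tokenizer produces for a fixed probe text, so a new BPE family like Falcon3 registers as a new digest. A sketch of the pattern (probe text and hashing details are assumed from upstream; only the falcon3 digest is taken from the hunk above):

```python
import hashlib

FALCON3_DIGEST = "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e"

def detect_pretokenizer(tokenizer, probe_text: str):
    # Fingerprint the tokenizer by hashing the IDs it assigns to probe_text.
    chkhsh = hashlib.sha256(str(tokenizer.encode(probe_text)).encode()).hexdigest()
    if chkhsh == FALCON3_DIGEST:
        return "falcon3"
    return None  # unknown tokenizer: the caller falls through to logger.warning
```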