Merge pull request #137 from younesbelkada/f3-changes

Feat: Add changes for Falcon3 release
Merged by tsong-ms on 2024-12-18 11:41:20 +08:00 (committed via GitHub).
5 changed files with 40 additions and 13 deletions
README.md: +6 -6
@@ -130,21 +130,21 @@ pip install -r requirements.txt
3. Build the project
```bash
# Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s
+python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
# Or you can manually download the model and run with local path
-huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
-python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
+huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
+python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
```
<pre>
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
[--use-pretuned]
Setup the environment for running inference
optional arguments:
-h, --help show this help message and exit
---hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
+--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}
Model used for inference
--model-dir MODEL_DIR, -md MODEL_DIR
Directory to save/load the model
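The same two-step flow works for every Falcon3 size accepted by `--hf-repo`. A sketch for the 10B variant (any repo from the choices above can be substituted):
```bash
# Download another supported Falcon3 model and set up the i2_s build
huggingface-cli download tiiuae/Falcon3-10B-Instruct-1.58bit --local-dir models/Falcon3-10B-Instruct-1.58bit
python setup_env.py -md models/Falcon3-10B-Instruct-1.58bit -q i2_s
```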
@@ -159,7 +159,7 @@ optional arguments:
### Basic usage
```bash
# Run inference with the quantized model
-python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0
+python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
# Output:
# Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
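Since `--conversation` is a `store_true` switch (see the run_inference.py diff below), `-cnv` takes no value; the system prompt still travels through `-p`. A sketch of both modes, assuming the model was set up as above:
```bash
# One-shot generation, as in the pre-Falcon3 example
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "Where is Mary?" -n 6 -temp 0
# Chat mode for instruct models: -cnv enables the conversation template
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
```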
run_inference.py: +3 -1
@@ -30,7 +30,8 @@ def run_inference():
'-ngl', '0',
'-c', str(args.ctx_size),
'--temp', str(args.temperature),
"-b", "1"
"-b", "1",
"-cnv" if args.conversation else ""
]
run_command(command)
@@ -48,6 +49,7 @@ if __name__ == "__main__":
parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")
args = parser.parse_args()
run_inference()
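One wrinkle in the hunk above: when `--conversation` is off, the ternary leaves an empty string in the argv list, which the subprocess receives as a literal empty argument. A minimal defensive sketch (`build_command` and `main_path` are hypothetical; the flags mirror the diff):
```python
def build_command(args, main_path):
    # Assemble the llama.cpp invocation; append -cnv only when chat mode is
    # requested instead of passing an empty-string placeholder argument.
    command = [
        main_path,
        '-ngl', '0',
        '-c', str(args.ctx_size),
        '--temp', str(args.temperature),
        '-b', '1',
    ]
    if args.conversation:
        command.append('-cnv')
    return command
```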
setup_env.py: +24 -3
@@ -19,7 +19,28 @@ SUPPORTED_HF_MODELS = {
},
"HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
"model_name": "Llama3-8B-1.58-100B-tokens",
-}
+},
+"tiiuae/Falcon3-7B-Instruct-1.58bit": {
+"model_name": "Falcon3-7B-1.58bit",
+},
+"tiiuae/Falcon3-7B-1.58bit": {
+"model_name": "Falcon3-7B-1.58bit",
+},
+"tiiuae/Falcon3-10B-Instruct-1.58bit": {
+"model_name": "Falcon3-10B-1.58bit",
+},
+"tiiuae/Falcon3-10B-1.58bit": {
+"model_name": "Falcon3-10B-1.58bit",
+},
+"tiiuae/Falcon3-3B-Instruct-1.58bit": {
+"model_name": "Falcon3-3B-1.58bit",
+},
+"tiiuae/Falcon3-3B-1.58bit": {
+"model_name": "Falcon3-3B-1.58bit",
+},
+"tiiuae/Falcon3-1B-Instruct-1.58bit": {
+"model_name": "Falcon3-1B-1.58bit",
+},
}
SUPPORTED_QUANT_TYPES = {
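Each Falcon3 base/instruct pair resolves to a single model_name, so the kernel codegen below only needs four Falcon3 entries. A hedged sketch of the lookup this table supports (the helper name is illustrative, not from the diff):
```python
def model_name_for_repo(hf_repo: str) -> str:
    # Map a Hugging Face repo id to the local model name used for kernel
    # selection; fail fast with the supported list on unknown repos.
    try:
        return SUPPORTED_HF_MODELS[hf_repo]["model_name"]
    except KeyError:
        supported = ", ".join(sorted(SUPPORTED_HF_MODELS))
        raise ValueError(f"unsupported repo {hf_repo!r}; expected one of: {supported}")
```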
@@ -133,7 +154,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
@@ -149,7 +170,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
convert-hf-to-gguf.py: +6 -2
@@ -272,7 +272,10 @@ class Model(ABC):
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
-tokens.append(reverse_vocab[i])
+# We need to manually encode and decode the added tokens in case special characters
+# used for `\n` / `\t` have been manually added in the added tokens
+encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+tokens.append(encoded_decoded_token)
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
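The intent of the round-trip: encoding and then decoding the stored token text pushes it through the tokenizer's own normalization, so escape sequences hand-typed into the added-tokens list come back as the surface form the tokenizer actually emits. A minimal sketch, assuming a Hugging Face tokenizer object (the helper name is illustrative):
```python
def canonical_token_text(tokenizer, text):
    # Round-trip the token text so characters written as literal "\n" / "\t"
    # come back in the tokenizer's canonical rendering. If the tokenizer
    # auto-inserts BOS/EOS, encode(text, add_special_tokens=False) would be
    # needed to keep the text intact.
    return tokenizer.decode(tokenizer.encode(text))
```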
@@ -280,7 +283,6 @@ class Model(ABC):
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes, tokpre
# NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -335,6 +337,8 @@ class Model(ABC):
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
res = "falcon3"
if res is None:
logger.warning("\n")
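For context, these chkhsh values come from hashing the token ids a tokenizer produces for a fixed probe string, which is what the generator script referenced in the NOTE above maintains. A hedged sketch of that fingerprint (names are illustrative):
```python
import hashlib

def pretokenizer_fingerprint(tokenizer, chktxt):
    # Tokenize a fixed probe text and hash the id sequence, so any change in
    # pre-tokenizer behavior yields a new hash like the Falcon3 one above.
    chktok = tokenizer.encode(chktxt)
    return hashlib.sha256(str(chktok).encode()).hexdigest()
```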