From 18cfa8af892cb43d7d5df459024f8f5fc1556161 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Thu, 14 Nov 2024 14:51:09 +0000
Subject: [PATCH 1/9] add fc3 support

---
 setup_env.py                       | 5 ++++-
 utils/convert-hf-to-gguf-bitnet.py | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index 8a9c4b4..ab4beb9 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -19,6 +19,9 @@ SUPPORTED_HF_MODELS = {
     },
     "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
         "model_name": "Llama3-8B-1.58-100B-tokens",
+    },
+    "tiiuae/falcon3-7b-1.58bit": {
+        "model_name": "falcon3-7b-1.58bit",
     }
 }
 
@@ -149,7 +152,7 @@ def gen_code():
         shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
     if get_model_name() == "bitnet_b1_58-large":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
-    elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "falcon3-7b-1.58bit"]:
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
     elif get_model_name() == "bitnet_b1_58-3B":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")

diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 55b27ae..5621126 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -335,6 +335,8 @@ class Model(ABC):
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            res = "falcon3"
 
         if res is None:
             logger.warning("\n")

From c1892d6818ed59e7afe8d63e1f926bab7e6ad83e Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Thu, 14 Nov 2024 14:53:43 +0000
Subject: [PATCH 2/9] updated submodule

---
 3rdparty/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 814d0ee..2ce8604 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 814d0ee5440495255a4e3a5a8abf001b27b539d4
+Subproject commit 2ce86040364799c44a48bb5a8407351812045dc6

From 22575a47cf0f41e5e5f958996868a5e6fe43679f Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Thu, 14 Nov 2024 15:08:47 +0000
Subject: [PATCH 3/9] update submodule

---
 .gitmodules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 2b36e49..d1146c9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "3rdparty/llama.cpp"]
 	path = 3rdparty/llama.cpp
-	url = https://github.com/Eddie-Wang1120/llama.cpp.git
-	branch = merge-dev
+	url = https://github.com/tiiuae/llama.cpp-internal.git
+	branch = fc3
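Note on PATCH 1/9: the `chkhsh` value registered above is the pre-tokenizer fingerprint that `convert-hf-to-gguf-bitnet.py` uses to map a checkpoint onto a known tokenizer family ("falcon3"). The converter computes it by tokenizing a fixed probe string and hashing the resulting token IDs, along the lines of the sketch below (the function name and `probe_text` parameter are illustrative; the script hard-codes its own probe string):

```python
from hashlib import sha256

from transformers import AutoTokenizer


def pretokenizer_fingerprint(model_dir: str, probe_text: str) -> str:
    # Tokenize a fixed probe string and hash the token-ID sequence: two
    # checkpoints whose tokenizers split the probe identically produce
    # the same hash, so the hash identifies the pre-tokenizer family.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    token_ids = tokenizer.encode(probe_text)
    return sha256(str(token_ids).encode()).hexdigest()
```

A checkpoint whose hash matches the "9d032fcb…" entry is labelled "falcon3" instead of falling through to the unknown-tokenizer warning visible in the surrounding context.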
From 7c57a5ae20dd8e99bd03732f490757a70a578913 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Sat, 23 Nov 2024 14:49:44 +0000
Subject: [PATCH 4/9] fix weird character issue

---
 utils/convert-hf-to-gguf-bitnet.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 5621126..8faa51e 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -272,7 +272,10 @@ class Model(ABC):
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
+                # We need to manually encode and decode the added tokens in case special characters
+                # used for `\n` / `\t` have been manually added in the added tokens
+                encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+                tokens.append(encoded_decoded_token)
                 if tokenizer.added_tokens_decoder[i].special:
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
@@ -280,7 +283,6 @@ class Model(ABC):
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
-
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -674,7 +676,7 @@ def read_model_config(model_dir: str) -> dict[str, Any]:
     with open(config, "r") as f:
         return json.load(f)
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "Falcon3ForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 

From a838911a55840943d974ad35330d7717cc14f14d Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Mon, 9 Dec 2024 16:45:31 +0000
Subject: [PATCH 5/9] more changes to support chat models

---
 run_inference.py                   | 4 +++-
 setup_env.py                       | 3 +++
 utils/convert-hf-to-gguf-bitnet.py | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/run_inference.py b/run_inference.py
index fe14e0e..7c279cd 100644
--- a/run_inference.py
+++ b/run_inference.py
@@ -30,7 +30,8 @@ def run_inference():
         '-ngl', '0',
         '-c', str(args.ctx_size),
         '--temp', str(args.temperature),
-        "-b", "1"
+        "-b", "1",
+        "-cnv" if args.cnv else ""
     ]
     run_command(command)
 
@@ -48,6 +49,7 @@ if __name__ == "__main__":
     parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
     parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
     parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
+    parser.add_argument("-cnv", "--conversation", ction='store_true', help="Whether to enable chat mode or not (for instruct models.)")
 
     args = parser.parse_args()
     run_inference()
\ No newline at end of file

diff --git a/setup_env.py b/setup_env.py
index ab4beb9..1ea456b 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -20,6 +20,9 @@ SUPPORTED_HF_MODELS = {
     "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
         "model_name": "Llama3-8B-1.58-100B-tokens",
     },
+    "tiiuae/falcon3-7b-instruct-1.58bit": {
+        "model_name": "falcon3-7b-1.58bit",
+    },
     "tiiuae/falcon3-7b-1.58bit": {
         "model_name": "falcon3-7b-1.58bit",
     }

diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 8faa51e..f525f58 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -676,7 +676,7 @@ def read_model_config(model_dir: str) -> dict[str, Any]:
     with open(config, "r") as f:
         return json.load(f)
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "Falcon3ForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
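Note on PATCH 4/9: the encode/decode round-trip added there normalizes added tokens whose stored text contains manually escaped characters for `\n` / `\t`, so the GGUF vocab records the surface form the tokenizer actually emits. A minimal sketch of the same idea (the repo id and token in the usage comment are illustrative):

```python
def normalize_added_token(tokenizer, raw_token: str) -> str:
    """Encode then decode so an added token stored with escaped characters
    comes back in the surface form the tokenizer actually produces."""
    return tokenizer.decode(tokenizer.encode(raw_token))


# Illustrative usage:
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct-1.58bit")
#   print(normalize_added_token(tok, "<|assistant|>"))
```

Note also that PATCH 5/9 above ships two small bugs that PATCH 7/9 corrects: `ction='store_true'` should read `action='store_true'`, and the command list tests `args.cnv` although argparse stores the flag as `args.conversation`.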
From de19627eef066b16faac78a0358891a630543975 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Mon, 16 Dec 2024 14:42:15 +0000
Subject: [PATCH 6/9] add 10b model

---
 setup_env.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 1ea456b..8440929 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -20,12 +20,27 @@ SUPPORTED_HF_MODELS = {
     "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
         "model_name": "Llama3-8B-1.58-100B-tokens",
     },
-    "tiiuae/falcon3-7b-instruct-1.58bit": {
-        "model_name": "falcon3-7b-1.58bit",
+    "tiiuae/Falcon3-7B-Instruct-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-7B-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-Instruct-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-Instruct-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-1B-Instruct-1.58bit": {
+        "model_name": "Falcon3-1B-1.58bit",
     },
-    "tiiuae/falcon3-7b-1.58bit": {
-        "model_name": "falcon3-7b-1.58bit",
-    }
 }
 
 SUPPORTED_QUANT_TYPES = {
@@ -139,7 +154,7 @@ def gen_code():
         shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
     if get_model_name() == "bitnet_b1_58-large":
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
-    elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
     elif get_model_name() == "bitnet_b1_58-3B":
         run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
@@ -155,7 +170,7 @@ def gen_code():
         shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
     if get_model_name() == "bitnet_b1_58-large":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
-    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "falcon3-7b-1.58bit"]:
+    elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
     elif get_model_name() == "bitnet_b1_58-3B":
         run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
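Note on PATCH 6/9: the Instruct and base repos of each size collapse onto one shared `model_name`, and every Falcon3 alias is routed to the existing Llama3-8B codegen branch, i.e. the Falcon3 models reuse the pretuned Llama3-8B kernel tile shapes rather than introducing new configurations. Condensed from the diff above (assumes `get_model_name`, `run_command`, and `sys` as defined in `setup_env.py`):

```python
# Falcon3 aliases that reuse the Llama3-8B tile configuration.
FALCON3_MODELS = ["Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit",
                  "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]

if get_model_name() in ["Llama3-8B-1.58-100B-tokens", *FALCON3_MODELS]:
    # --BM/--BK/--bm are the tile sizes handed to the TL1 kernel
    # generator; the values are copied verbatim from the diff.
    run_command([sys.executable, "utils/codegen_tl1.py",
                 "--model", "Llama3-8B-1.58-100B-tokens",
                 "--BM", "256,128,256,128",
                 "--BK", "128,64,128,64",
                 "--bm", "32,64,32,64"], log_step="codegen")
```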
From c1c55417c230362a74bb36824b72c37dee6f4af0 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Mon, 16 Dec 2024 15:26:33 +0000
Subject: [PATCH 7/9] fix issues

---
 run_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_inference.py b/run_inference.py
index 7c279cd..75a7246 100644
--- a/run_inference.py
+++ b/run_inference.py
@@ -31,7 +31,7 @@ def run_inference():
         '-c', str(args.ctx_size),
         '--temp', str(args.temperature),
         "-b", "1",
-        "-cnv" if args.cnv else ""
+        "-cnv" if args.conversation else ""
     ]
     run_command(command)
 
@@ -49,7 +49,7 @@ if __name__ == "__main__":
     parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
    parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
     parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
-    parser.add_argument("-cnv", "--conversation", ction='store_true', help="Whether to enable chat mode or not (for instruct models.)")
+    parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")
 
     args = parser.parse_args()
     run_inference()
\ No newline at end of file

From 85c32473230f5f6d483cb8f9756743a7d98c26c5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 17 Dec 2024 07:05:35 +0000
Subject: [PATCH 8/9] add changes on README

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 4bbfc6a..4739214 100644
--- a/README.md
+++ b/README.md
@@ -130,21 +130,21 @@ pip install -r requirements.txt
 3. Build the project
 ```bash
 # Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s
+python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
 # Or you can manually download the model and run with local path
-huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
-python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
+huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
+python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
 ```
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
                     [--use-pretuned]
 
 Setup the environment for running inference
 
 optional arguments:
   -h, --help            show this help message and exit
-  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
+  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}
                         Model used for inference
   --model-dir MODEL_DIR, -md MODEL_DIR
                         Directory to save/load the model
@@ -159,7 +159,7 @@ optional arguments:
 ### Basic usage
 ```bash
 # Run inference with the quantized model
-python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0
+python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
 
 # Output:
 # Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?

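Note on PATCH 8/9: the new README example depends on the `-cnv` flag wired up in PATCHES 5/9 and 7/9 (`-p` supplies the system prompt in chat mode). After those patches, `run_inference.py` assembles the llama.cpp invocation roughly as below (condensed; `args` comes from the argparse block shown earlier). One caveat worth flagging: when chat mode is off, the conditional appends an empty string to the argument list instead of omitting the flag entirely.

```python
# Condensed from run_inference.py as of PATCH 7/9.
command = [
    # ... binary path, model, prompt, thread/context/temperature flags ...
    "-b", "1",
    "-cnv" if args.conversation else ""  # note: leaves an empty "" argv
                                         # entry when chat mode is off
]
run_command(command)
```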
From 6a5134a6f0355deeb99a24b98c714c0fd4eccfde Mon Sep 17 00:00:00 2001
From: younesbelkada 
Date: Tue, 17 Dec 2024 08:45:25 +0000
Subject: [PATCH 9/9] change

---
 .gitmodules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index d1146c9..2b36e49 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "3rdparty/llama.cpp"]
 	path = 3rdparty/llama.cpp
-	url = https://github.com/tiiuae/llama.cpp-internal.git
-	branch = fc3
+	url = https://github.com/Eddie-Wang1120/llama.cpp.git
+	branch = merge-dev