diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 814d0ee..2ce8604 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 814d0ee5440495255a4e3a5a8abf001b27b539d4
+Subproject commit 2ce86040364799c44a48bb5a8407351812045dc6
diff --git a/README.md b/README.md
index 4bbfc6a..4739214 100644
--- a/README.md
+++ b/README.md
@@ -130,21 +130,21 @@ pip install -r requirements.txt
 3. Build the project
 ```bash
 # Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s
+python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
 # Or you can manually download the model and run with local path
-huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
-python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
+huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
+python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
 ```
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
                     [--use-pretuned]
 
 Setup the environment for running inference
 
 optional arguments:
   -h, --help            show this help message and exit
-  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
+  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}
                         Model used for inference
   --model-dir MODEL_DIR, -md MODEL_DIR
                         Directory to save/load the model
@@ -159,7 +159,7 @@ optional arguments:
 ### Basic usage
 ```bash
 # Run inference with the quantized model
-python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0
+python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
 
 # Output:
 # Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
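The README's manual-download step has a straightforward Python equivalent through `huggingface_hub`, which `huggingface-cli` wraps; a minimal sketch using the same repo and target directory as above:

```python
# Sketch of the manual-download path from the README, using huggingface_hub
# directly instead of the huggingface-cli wrapper.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="tiiuae/Falcon3-7B-Instruct-1.58bit",
    local_dir="models/Falcon3-7B-Instruct-1.58bit",
)
# Then convert and build as in the README:
#   python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
```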
diff --git a/run_inference.py b/run_inference.py
index fe14e0e..75a7246 100644
--- a/run_inference.py
+++ b/run_inference.py
@@ -30,7 +30,11 @@ def run_inference():
         '-ngl', '0',
         '-c', str(args.ctx_size),
         '--temp', str(args.temperature),
-        "-b", "1"
+        "-b", "1",
     ]
+    # Append the flag only when chat mode is requested; unconditionally adding
+    # an empty string would reach llama-cli as a bogus positional argument
+    if args.conversation:
+        command.append("-cnv")
     run_command(command)
 
@@ -48,6 +52,7 @@ if __name__ == "__main__":
     parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
     parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
     parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
+    parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")
 
     args = parser.parse_args()
     run_inference()
\ No newline at end of file
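Because `-cnv` is declared with `action='store_true'`, it consumes no value; the system prompt still travels through `-p`. A self-contained check of that parsing behavior, with argument names mirroring `run_inference.py`:

```python
import argparse

# Mirrors the two relevant arguments from run_inference.py
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--prompt", type=str, default="")
parser.add_argument("-cnv", "--conversation", action="store_true",
                    help="Whether to enable chat mode (for instruct models)")

args = parser.parse_args(["-p", "You are a helpful assistant", "-cnv"])
assert args.conversation is True       # bare flag, no value attached
assert args.prompt == "You are a helpful assistant"
```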
diff --git a/setup_env.py b/setup_env.py
index 8a9c4b4..8440929 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -19,7 +19,28 @@ SUPPORTED_HF_MODELS = {
     },
     "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
         "model_name": "Llama3-8B-1.58-100B-tokens",
-    }
+    },
+    "tiiuae/Falcon3-7B-Instruct-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-7B-1.58bit": {
+        "model_name": "Falcon3-7B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-Instruct-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-10B-1.58bit": {
+        "model_name": "Falcon3-10B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-Instruct-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-3B-1.58bit": {
+        "model_name": "Falcon3-3B-1.58bit",
+    },
+    "tiiuae/Falcon3-1B-Instruct-1.58bit": {
+        "model_name": "Falcon3-1B-1.58bit",
+    },
 }
 
 SUPPORTED_QUANT_TYPES = {
@@ -133,7 +154,7 @@ def gen_code():
                 shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
         if get_model_name() == "bitnet_b1_58-large":
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
-        elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+        elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
@@ -149,7 +170,7 @@ def gen_code():
             shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
         if get_model_name() == "bitnet_b1_58-large":
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
-        elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
+        elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 55b27ae..f525f58 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -272,7 +272,10 @@ class Model(ABC):
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
+                # Round-trip added tokens through the tokenizer in case escape
+                # sequences for `\n` / `\t` were stored literally in the added vocab
+                encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+                tokens.append(encoded_decoded_token)
                 if tokenizer.added_tokens_decoder[i].special:
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
@@ -280,7 +283,6 @@ class Model(ABC):
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
-
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -335,6 +337,8 @@ class Model(ABC):
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            res = "falcon3"
 
         if res is None:
             logger.warning("\n")