Merge pull request #137 from younesbelkada/f3-changes

Feat: Add changes for Falcon3 release
Merged by tsong-ms on 2024-12-18 11:41:20 +08:00 (committed via GitHub).
5 changed files with 40 additions and 13 deletions
README.md: +6 -6
@@ -130,21 +130,21 @@ pip install -r requirements.txt
3. Build the project
```bash
# Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s
+python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
# Or you can manually download the model and run with local path
-huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
-python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
+huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
+python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
```
<pre>
-usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
+usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
[--use-pretuned]
Setup the environment for running inference
optional arguments:
-h, --help show this help message and exit
---hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
+--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}
Model used for inference
--model-dir MODEL_DIR, -md MODEL_DIR
Directory to save/load the model
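The same two-step flow works for every Falcon3 size accepted by `--hf-repo`. A sketch for the 10B variant (any repo from the choices above can be substituted):
```bash
# Download another supported Falcon3 model and set up the i2_s build
huggingface-cli download tiiuae/Falcon3-10B-Instruct-1.58bit --local-dir models/Falcon3-10B-Instruct-1.58bit
python setup_env.py -md models/Falcon3-10B-Instruct-1.58bit -q i2_s
```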
@@ -159,7 +159,7 @@ optional arguments:
### Basic usage
```bash
# Run inference with the quantized model
-python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0
+python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
# Output:
# Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
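Since `--conversation` is a `store_true` switch (see the run_inference.py diff below), `-cnv` takes no value; the system prompt still travels through `-p`. A sketch of both modes, assuming the model was set up as above:
```bash
# One-shot generation, as in the pre-Falcon3 example
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "Where is Mary?" -n 6 -temp 0
# Chat mode for instruct models: -cnv enables the conversation template
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
```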
run_inference.py: +3 -1
@@ -30,7 +30,8 @@ def run_inference():
'-ngl', '0',
'-c', str(args.ctx_size),
'--temp', str(args.temperature),
"-b", "1"
"-b", "1",
"-cnv" if args.conversation else ""
]
run_command(command)
@@ -48,6 +49,7 @@ if __name__ == "__main__":
parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")
args = parser.parse_args()
run_inference()
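One wrinkle in the hunk above: when `--conversation` is off, the ternary leaves an empty string in the argv list, which the subprocess receives as a literal empty argument. A minimal defensive sketch (`build_command` and `main_path` are hypothetical; the flags mirror the diff):
```python
def build_command(args, main_path):
    # Assemble the llama.cpp invocation; append -cnv only when chat mode is
    # requested instead of passing an empty-string placeholder argument.
    command = [
        main_path,
        '-ngl', '0',
        '-c', str(args.ctx_size),
        '--temp', str(args.temperature),
        '-b', '1',
    ]
    if args.conversation:
        command.append('-cnv')
    return command
```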
setup_env.py: +24 -3
@@ -19,7 +19,28 @@ SUPPORTED_HF_MODELS = {
},
"HF1BitLLM/Llama3-8B-1.58-100B-tokens": {
"model_name": "Llama3-8B-1.58-100B-tokens",
-}
+},
+"tiiuae/Falcon3-7B-Instruct-1.58bit": {
+"model_name": "Falcon3-7B-1.58bit",
+},
+"tiiuae/Falcon3-7B-1.58bit": {
+"model_name": "Falcon3-7B-1.58bit",
+},
+"tiiuae/Falcon3-10B-Instruct-1.58bit": {
+"model_name": "Falcon3-10B-1.58bit",
+},
+"tiiuae/Falcon3-10B-1.58bit": {
+"model_name": "Falcon3-10B-1.58bit",
+},
+"tiiuae/Falcon3-3B-Instruct-1.58bit": {
+"model_name": "Falcon3-3B-1.58bit",
+},
+"tiiuae/Falcon3-3B-1.58bit": {
+"model_name": "Falcon3-3B-1.58bit",
+},
+"tiiuae/Falcon3-1B-Instruct-1.58bit": {
+"model_name": "Falcon3-1B-1.58bit",
+},
}
SUPPORTED_QUANT_TYPES = {
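Each Falcon3 base/instruct pair resolves to a single model_name, so the kernel codegen below only needs four Falcon3 entries. A hedged sketch of the lookup this table supports (the helper name is illustrative, not from the diff):
```python
def model_name_for_repo(hf_repo: str) -> str:
    # Map a Hugging Face repo id to the local model name used for kernel
    # selection; fail fast with the supported list on unknown repos.
    try:
        return SUPPORTED_HF_MODELS[hf_repo]["model_name"]
    except KeyError:
        supported = ", ".join(sorted(SUPPORTED_HF_MODELS))
        raise ValueError(f"unsupported repo {hf_repo!r}; expected one of: {supported}")
```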
@@ -133,7 +154,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
@@ -149,7 +170,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
elif get_model_name() == "Llama3-8B-1.58-100B-tokens":
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
convert-hf-to-gguf.py: +6 -2
@@ -272,7 +272,10 @@ class Model(ABC):
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
-tokens.append(reverse_vocab[i])
+# We need to manually encode and decode the added tokens in case special characters
+# used for `\n` / `\t` have been manually added in the added tokens
+encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+tokens.append(encoded_decoded_token)
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
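The intent of the round-trip: encoding and then decoding the stored token text pushes it through the tokenizer's own normalization, so escape sequences hand-typed into the added-tokens list come back as the surface form the tokenizer actually emits. A minimal sketch, assuming a Hugging Face tokenizer object (the helper name is illustrative):
```python
def canonical_token_text(tokenizer, text):
    # Round-trip the token text so characters written as literal "\n" / "\t"
    # come back in the tokenizer's canonical rendering. If the tokenizer
    # auto-inserts BOS/EOS, encode(text, add_special_tokens=False) would be
    # needed to keep the text intact.
    return tokenizer.decode(tokenizer.encode(text))
```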
@@ -280,7 +283,6 @@ class Model(ABC):
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes, tokpre
# NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -335,6 +337,8 @@ class Model(ABC):
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
res = "falcon3"
if res is None:
logger.warning("\n")
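For context, these chkhsh values come from hashing the token ids a tokenizer produces for a fixed probe string, which is what the generator script referenced in the NOTE above maintains. A hedged sketch of that fingerprint (names are illustrative):
```python
import hashlib

def pretokenizer_fingerprint(tokenizer, chktxt):
    # Tokenize a fixed probe text and hash the id sequence, so any change in
    # pre-tokenizer behavior yields a new hash like the Falcon3 one above.
    chktok = tokenizer.encode(chktxt)
    return hashlib.sha256(str(chktok).encode()).hexdigest()
```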