From 7c57a5ae20dd8e99bd03732f490757a70a578913 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Sat, 23 Nov 2024 14:49:44 +0000
Subject: [PATCH] fix weird character issue

---
 utils/convert-hf-to-gguf-bitnet.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 5621126..8faa51e 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -272,7 +272,10 @@ class Model(ABC):
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
+                # We need to manually encode and decode the added tokens in case special characters
+                # used for `\n` / `\t` have been manually added in the added tokens
+                encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
+                tokens.append(encoded_decoded_token)
                 if tokenizer.added_tokens_decoder[i].special:
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
@@ -280,7 +283,6 @@ class Model(ABC):
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
-
         return tokens, toktypes, tokpre

     # NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -674,7 +676,7 @@ def read_model_config(model_dir: str) -> dict[str, Any]:
     with open(config, "r") as f:
         return json.load(f)


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "Falcon3ForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA