Fix weird character issue

This commit is contained in:
younesbelkada
2024-11-23 14:49:44 +00:00
parent 22575a47cf
commit 7c57a5ae20
+5 -3
View File
@@ -272,7 +272,10 @@ class Model(ABC):
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
# Manually encode and decode the added tokens in case special characters
# such as `\n` / `\t` were manually added to the added-tokens vocabulary
encoded_decoded_token = tokenizer.decode(tokenizer.encode(reverse_vocab[i]))
tokens.append(encoded_decoded_token)
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
@@ -280,7 +283,6 @@ class Model(ABC):
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes, tokpre
# NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -674,7 +676,7 @@ def read_model_config(model_dir: str) -> dict[str, Any]:
with open(config, "r") as f:
return json.load(f)
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "Falcon3ForCausalLM")
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA