
Add phi-2 tokenizer #7300

Open · wants to merge 2 commits into master
Conversation

BramVanroy

This snippet yields an error:

python -c "
from huggingface_hub import snapshot_download;
snapshot_download(repo_id='microsoft/phi-2', local_dir='phi-2', local_dir_use_symlinks=False)
"
python convert-hf-to-gguf.py phi-2/ --outtype f16
Traceback (most recent call last):
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 3001, in <module>
    main()
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 2988, in main
    model_instance.set_vocab()
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 75, in set_vocab
    self._set_vocab_gpt2()
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 331, in _set_vocab_gpt2
    tokens, toktypes, tokpre = self.get_vocab_base()
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 242, in get_vocab_base
    tokpre = self.get_vocab_base_pre(tokenizer)
  File "/home/local/vanroy/llama.cpp/convert-hf-to-gguf.py", line 323, in get_vocab_base_pre
    raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
NotImplementedError: BPE pre-tokenizer was not recognized - update get_vocab_base_pre()

The proposed changes add support for phi-2, which uses CodeGenTokenizer, a BPE tokenizer.

closes #7022
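
For context: get_vocab_base_pre() identifies the pre-tokenizer by hashing the token IDs the HF tokenizer produces for a fixed test string, and raises the NotImplementedError above when that hash is not in its table. A minimal Python sketch of the idea (the placeholder string below stands in for the much longer chktxt in convert-hf-to-gguf.py, so the printed hash will not match the script's table):

# Sketch of the fingerprint check in get_vocab_base_pre()
# (chktxt here is a placeholder; the real test string is much longer)
from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "Hello world, 123 !!!"  # placeholder for the real chktxt
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # an unrecognized value here is what triggers the error above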

@mofosyne added the labels "model (Model specific)" and "review complexity: medium (generally requires more time to grok, but manageable at beginner-to-medium expertise level)" on May 15, 2024

linpan commented May 17, 2024

raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

NotImplementedError: BPE pre-tokenizer was not recognized - update get_vocab_base_pre()

@@ -469,6 +469,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
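
Note that these hash branches are normally generated rather than hand-written: convert-hf-to-gguf-update.py downloads each listed tokenizer, hashes its output on the shared chktxt string, and regenerates get_vocab_base_pre(). Assuming that script's models list keeps its current shape, the matching entry would look like:

# In convert-hf-to-gguf-update.py, extend the models list so the script
# can fetch the tokenizer and regenerate the hash table automatically:
models = [
    # ... existing entries ...
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2"},
]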
ggerganov (Owner) commented on the diff:
This new pre-tokenizer has to be handled in llama.cpp:

llama.cpp, lines 4414 to 4475 at e18bc6a:

    // for now, only BPE models have pre-tokenizers
    if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
        if (tokenizer_pre.empty()) {
            LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
            LLAMA_LOG_WARN("%s: \n", __func__);
            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
            LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
            LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
            LLAMA_LOG_WARN("%s: \n", __func__);
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (
                tokenizer_pre == "default") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (
                tokenizer_pre == "llama3"   ||
                tokenizer_pre == "llama-v3" ||
                tokenizer_pre == "llama-bpe") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
        } else if (
                tokenizer_pre == "deepseek-llm") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
        } else if (
                tokenizer_pre == "deepseek-coder") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
        } else if (
                tokenizer_pre == "falcon") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
        } else if (
                tokenizer_pre == "mpt") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
        } else if (
                tokenizer_pre == "starcoder") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
        } else if (
                tokenizer_pre == "gpt-2"      ||
                tokenizer_pre == "jina-es"    ||
                tokenizer_pre == "jina-de"    ||
                tokenizer_pre == "jina-v2-es" ||
                tokenizer_pre == "jina-v2-de") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
        } else if (
                tokenizer_pre == "refact") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
        } else if (
                tokenizer_pre == "command-r") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
        } else if (
                tokenizer_pre == "qwen2") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
        } else if (
                tokenizer_pre == "olmo") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
        } else if (
                tokenizer_pre == "dbrx") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
        } else {
            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
        }
    } else {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }
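
For illustration, a branch of the kind this comment asks for might look as follows (a sketch, not part of this PR's diff; mapping "phi-2" onto the GPT-2 regex is an assumption based on CodeGenTokenizer being a GPT-2-style byte-level BPE tokenizer):

        } else if (
                tokenizer_pre == "phi-2") {
            // assumption: CodeGenTokenizer reuses GPT-2's byte-level BPE,
            // so the GPT-2 pre-tokenizer regex should apply
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;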

Contributor:
@ggerganov That's not necessary. I solved this already in #7219 and #7117.


turian commented May 22, 2024

Hi @BramVanroy, I was encouraging you in #7022 to test that HF and llama.cpp tokenization are identical. Here is a Colab you could modify to try: https://colab.research.google.com/drive/1RYlEj2UhylYWyaASFo-LLATzZ8d29Z0T?usp=sharing
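
A parity check along those lines could look like this (a sketch, assuming llama-cpp-python is installed and that convert-hf-to-gguf.py wrote the GGUF to phi-2/ggml-model-f16.gguf; the model path and test strings are illustrative):

# Sketch: compare HF and llama.cpp tokenizations on a few strings
# (uses llama-cpp-python; vocab_only skips loading the weights)
from llama_cpp import Llama
from transformers import AutoTokenizer

hf_tok = AutoTokenizer.from_pretrained("microsoft/phi-2")
llm = Llama(model_path="phi-2/ggml-model-f16.gguf", vocab_only=True)

tests = ["Hello world", "   leading spaces", "print('hi')\n\ttabs", "héllo wörld 123"]
for text in tests:
    hf_ids = hf_tok.encode(text)
    # llama-cpp-python tokenizes bytes; add_bos=False to match HF's default here
    ll_ids = llm.tokenize(text.encode("utf-8"), add_bos=False)
    status = "OK" if hf_ids == ll_ids else "MISMATCH"
    print(f"{status}: {text!r}\n  hf={hf_ids}\n  llama={ll_ids}")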

Successfully merging this pull request may close these issues: Supporting phi-2 tokenizer (#7022)