models : simplify the conversion script

"transformers" dependency is not actually needed
Georgi Gerganov 2022-11-16 19:21:43 +02:00
parent 55a0e1a64e
commit e70e5c8b53


@@ -40,8 +40,8 @@ import code
 import torch
 import numpy as np
 
-from transformers import GPTJForCausalLM
-from transformers import GPT2TokenizerFast
+#from transformers import GPTJForCausalLM
+#from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 LANGUAGES = {
@@ -146,25 +146,25 @@ LANGUAGES = {
     "su": "sundanese",
 }
 
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-
-    specials = [
-        "<|startoftranscript|>",
-        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-        "<|translate|>",
-        "<|transcribe|>",
-        "<|startoflm|>",
-        "<|startofprev|>",
-        "<|nocaptions|>",
-        "<|notimestamps|>",
-    ]
-
-    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-    return tokenizer
+## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+#
+#    specials = [
+#        "<|startoftranscript|>",
+#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+#        "<|translate|>",
+#        "<|transcribe|>",
+#        "<|startoflm|>",
+#        "<|startofprev|>",
+#        "<|nocaptions|>",
+#        "<|notimestamps|>",
+#    ]
+#
+#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+#    return tokenizer
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
@@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
+dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
+#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
 
 #print(tokenizer)
 #print(tokenizer.name_or_path)
 #print(len(tokenizer.additional_special_tokens))
-dir_tokenizer = tokenizer.name_or_path
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
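With build_tokenizer gone, the script only needs the tokenizer assets that ship with the whisper repository itself. The hunks above don't show how the vocabulary is consumed afterwards, so the following is only a minimal sketch of the idea, assuming dir_whisper points at a local checkout of the openai/whisper repo and that the assets directory contains a vocab.json, as in the upstream repository at the time of this commit:

import json
import os

# hypothetical path for illustration -- a local checkout of
# https://github.com/openai/whisper
dir_whisper = "/path/to/whisper"
multilingual = True  # in the script: hparams["n_vocab"] == 51865

# same expression as in the diff above
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets",
                             multilingual and "multilingual" or "gpt2")

# read the BPE vocabulary directly -- no transformers needed
with open(os.path.join(dir_tokenizer, "vocab.json"), "r", encoding="utf8") as f:
    tokens = json.load(f)  # dict: token string -> integer id

print("vocab size:", len(tokens))

Since GPT2TokenizerFast was only used to resolve this directory and enumerate the vocabulary, reading the JSON file directly removes the heavyweight dependency without changing what gets written to the ggml model file.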