mirror of https://github.com/ggerganov/whisper.cpp.git
models : simplify the conversion script
"transformers" dependency is not actually needed
commit e70e5c8b53 (parent 55a0e1a64e)
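For context, the conversion script in question is models/convert-pt-to-ggml.py. The `transformers` dependency was only used to instantiate `GPT2TokenizerFast` over tokenizer files that already ship with the whisper repo under `whisper/assets/` as plain JSON and text, so the script can resolve that directory and read the files itself. A minimal sketch of the idea, assuming the Hugging Face file layout (`vocab.json` mapping token strings to integer ids) present in those asset directories; the helper below is illustrative, not code from the commit:

import json
import os

def load_vocab(path_to_whisper_repo: str, name: str = "gpt2") -> dict:
    # whisper/assets/<name>/ contains plain tokenizer files
    # (vocab.json, merges.txt, ...) - nothing transformers-specific.
    dir_tokenizer = os.path.join(path_to_whisper_repo, "whisper/assets", name)
    with open(os.path.join(dir_tokenizer, "vocab.json"), "r", encoding="utf8") as f:
        return json.load(f)  # token string -> integer id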
@@ -40,8 +40,8 @@ import code
 import torch
 import numpy as np
 
-from transformers import GPTJForCausalLM
-from transformers import GPT2TokenizerFast
+#from transformers import GPTJForCausalLM
+#from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 LANGUAGES = {
@@ -146,25 +146,25 @@ LANGUAGES = {
     "su": "sundanese",
 }
 
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-
-    specials = [
-        "<|startoftranscript|>",
-        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-        "<|translate|>",
-        "<|transcribe|>",
-        "<|startoflm|>",
-        "<|startofprev|>",
-        "<|nocaptions|>",
-        "<|notimestamps|>",
-    ]
-
-    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-    return tokenizer
+## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+#
+#    specials = [
+#        "<|startoftranscript|>",
+#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+#        "<|translate|>",
+#        "<|transcribe|>",
+#        "<|startoflm|>",
+#        "<|startofprev|>",
+#        "<|nocaptions|>",
+#        "<|notimestamps|>",
+#    ]
+#
+#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+#    return tokenizer
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
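The helper commented out above is what pulled in transformers: `GPT2TokenizerFast.from_pretrained` loaded the base vocabulary from `whisper/assets/<name>` and `add_special_tokens` registered Whisper's special tokens on top of it. If those special tokens are ever needed without transformers, the list can be rebuilt by hand. A sketch under one assumption, namely the Hugging Face convention that added special tokens receive consecutive ids immediately after the base vocabulary; this helper is illustrative, not part of the commit:

def special_token_ids(base_vocab_size: int, languages: dict) -> dict:
    # Same special-token list the removed build_tokenizer() registered;
    # `languages` is the LANGUAGES dict defined earlier in the script.
    specials = [
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in languages.keys()],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nocaptions|>",
        "<|notimestamps|>",
    ]
    # Assumption: added special tokens get consecutive ids right after
    # the base vocabulary, which is how GPT2TokenizerFast assigns them.
    return {tok: base_vocab_size + i for i, tok in enumerate(specials)}

Called as, e.g., special_token_ids(len(vocab), LANGUAGES) once the base vocabulary has been loaded from vocab.json.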
@@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
+dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
 
+#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
 #print(tokenizer)
 #print(tokenizer.name_or_path)
 #print(len(tokenizer.additional_special_tokens))
-dir_tokenizer = tokenizer.name_or_path
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
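Note the `multilingual and "multilingual" or "gpt2"` expression in the new `dir_tokenizer` line: it is the old-style Python and/or conditional idiom, safe here because `"multilingual"` is a truthy string. An equivalent, more readable modern spelling (illustrative only, not part of the commit):

# multilingual and dir_whisper come from the conversion script above
name = "multilingual" if multilingual else "gpt2"
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", name)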