mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-28 01:29:17 +01:00
models : simplify the conversion script
The "transformers" dependency is not actually needed.
This commit is contained in:
parent
55a0e1a64e
commit
e70e5c8b53
@ -40,8 +40,8 @@ import code
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from transformers import GPTJForCausalLM
|
#from transformers import GPTJForCausalLM
|
||||||
from transformers import GPT2TokenizerFast
|
#from transformers import GPT2TokenizerFast
|
||||||
|
|
||||||
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
|
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
|
||||||
LANGUAGES = {
|
LANGUAGES = {
|
||||||
@ -146,25 +146,25 @@ LANGUAGES = {
|
|||||||
"su": "sundanese",
|
"su": "sundanese",
|
||||||
}
|
}
|
||||||
|
|
||||||
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
|
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
|
||||||
def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
|
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
|
# path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
|
||||||
tokenizer = GPT2TokenizerFast.from_pretrained(path)
|
# tokenizer = GPT2TokenizerFast.from_pretrained(path)
|
||||||
|
#
|
||||||
specials = [
|
# specials = [
|
||||||
"<|startoftranscript|>",
|
# "<|startoftranscript|>",
|
||||||
*[f"<|{lang}|>" for lang in LANGUAGES.keys()],
|
# *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
|
||||||
"<|translate|>",
|
# "<|translate|>",
|
||||||
"<|transcribe|>",
|
# "<|transcribe|>",
|
||||||
"<|startoflm|>",
|
# "<|startoflm|>",
|
||||||
"<|startofprev|>",
|
# "<|startofprev|>",
|
||||||
"<|nocaptions|>",
|
# "<|nocaptions|>",
|
||||||
"<|notimestamps|>",
|
# "<|notimestamps|>",
|
||||||
]
|
# ]
|
||||||
|
#
|
||||||
tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
|
# tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
|
||||||
return tokenizer
|
# return tokenizer
|
||||||
|
|
||||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||||
def bytes_to_unicode():
|
def bytes_to_unicode():
|
||||||
@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
|
|||||||
#code.interact(local=locals())
|
#code.interact(local=locals())
|
||||||
|
|
||||||
multilingual = hparams["n_vocab"] == 51865
|
multilingual = hparams["n_vocab"] == 51865
|
||||||
tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
|
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
|
||||||
|
|
||||||
|
#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
|
||||||
#print(tokenizer)
|
#print(tokenizer)
|
||||||
#print(tokenizer.name_or_path)
|
#print(tokenizer.name_or_path)
|
||||||
#print(len(tokenizer.additional_special_tokens))
|
#print(len(tokenizer.additional_special_tokens))
|
||||||
dir_tokenizer = tokenizer.name_or_path
|
|
||||||
|
|
||||||
# output in the same directory as the model
|
# output in the same directory as the model
|
||||||
fname_out = dir_out + "/ggml-model.bin"
|
fname_out = dir_out + "/ggml-model.bin"
|
||||||
|
Loading…
Reference in New Issue
Block a user