mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-19 09:42:16 +02:00
ggml : sync latest ggml
- New Q4 and Q5 formats - Various improvements
This commit is contained in:
@@ -6,7 +6,6 @@
|
||||
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
||||
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
|
||||
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
|
||||
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
||||
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
|
||||
switch (ftype) {
|
||||
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
||||
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
|
||||
{
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
|
@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i > argc) {
|
||||
fprintf(stderr, "Invalid file param");
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " load prompt from a file\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
||||
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
||||
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
|
||||
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
||||
return result;
|
||||
}
|
||||
|
||||
void gpt_vocab::add_special_token(const std::string & token) {
|
||||
special_tokens.push_back(token);
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
||||
std::vector<std::string> words;
|
||||
|
||||
@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
||||
std::string str = text;
|
||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
|
||||
// Generate the subpattern from the special_tokens vector if it's not empty
|
||||
if (!vocab.special_tokens.empty()) {
|
||||
std::string special_tokens_subpattern;
|
||||
for (const auto & token : vocab.special_tokens) {
|
||||
if (!special_tokens_subpattern.empty()) {
|
||||
special_tokens_subpattern += "|";
|
||||
}
|
||||
special_tokens_subpattern += token;
|
||||
}
|
||||
|
||||
// Modify the regex pattern with the generated special tokens subpattern
|
||||
pat = special_tokens_subpattern + "|" + pat;
|
||||
}
|
||||
|
||||
std::regex re(pat);
|
||||
std::smatch m;
|
||||
|
||||
|
@@ -53,6 +53,9 @@ struct gpt_vocab {
|
||||
|
||||
std::map<token, id> token_to_id;
|
||||
std::map<id, token> id_to_token;
|
||||
std::vector<std::string> special_tokens;
|
||||
|
||||
void add_special_token(const std::string & token);
|
||||
};
|
||||
|
||||
// poor-man's JSON parsing
|
||||
|
@@ -25,7 +25,7 @@ struct whisper_hparams {
|
||||
int32_t n_text_head = 6;
|
||||
int32_t n_text_layer = 4;
|
||||
int32_t n_mels = 80;
|
||||
int32_t f16 = 1;
|
||||
int32_t ftype = 1;
|
||||
};
|
||||
|
||||
struct whisper_filters {
|
||||
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
||||
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
|
||||
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
|
||||
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
|
||||
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
||||
|
||||
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
||||
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
||||
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
||||
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
||||
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
||||
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
|
||||
fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
||||
fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
||||
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
||||
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
||||
|
||||
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
|
||||
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
||||
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
|
||||
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
|
||||
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
|
||||
fout.write((char *) &ftype, sizeof(hparams.f16));
|
||||
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
|
||||
}
|
||||
|
||||
// load mel filters
|
||||
|
Reference in New Issue
Block a user