mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-04-10 10:38:18 +02:00
whisper : fix the bug related to word splitting errors in the "tokenize" function. (#760)
Co-authored-by: AfryMask <afrymask@gmail.com>
This commit is contained in:
parent
1c5edc3cb3
commit
7e2afa4384
17
whisper.cpp
17
whisper.cpp
@@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
|
|||||||
int n = word.size();
|
int n = word.size();
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
int j = n;
|
int j = n;
|
||||||
|
bool found = false;
|
||||||
while (j > i) {
|
while (j > i) {
|
||||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
auto sub = word.substr(i, j-i);
|
||||||
|
auto it = vocab.token_to_id.find(sub);
|
||||||
if (it != vocab.token_to_id.end()) {
|
if (it != vocab.token_to_id.end()) {
|
||||||
tokens.push_back(it->second);
|
tokens.push_back(it->second);
|
||||||
i = j;
|
i = j;
|
||||||
|
found = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
--j;
|
--j;
|
||||||
}
|
}
|
||||||
if (i == n) {
|
if (!found) {
|
||||||
break;
|
fprintf(stderr, "unknown token \n");
|
||||||
}
|
|
||||||
if (j == i) {
|
|
||||||
auto sub = word.substr(i, 1);
|
|
||||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
|
||||||
tokens.push_back(vocab.token_to_id.at(sub));
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
|
||||||
}
|
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user