server : backport .srt output format (#1565)

This commit adds support for the .srt output format to the Whisper server. The
code is effectively backported from examples/main. The output MIME type is set
to application/x-subrip as per https://en.wikipedia.org/wiki/SubRip.

Example usage:

  curl 127.0.0.1:8080/inference \
    -H "Content-Type: multipart/form-data" \
    -F file="@<file-path>" \
    -F temperature="0.2" \
    -F response-format="srt"
This commit is contained in:
Oleg Sidorov 2023-11-28 14:42:58 +01:00 committed by GitHub
parent 73d5005880
commit 6559b538e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -11,6 +11,7 @@
#include <thread> #include <thread>
#include <vector> #include <vector>
#include <cstring> #include <cstring>
#include <sstream>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -657,6 +658,27 @@ int main(int argc, char ** argv) {
std::string results = output_str(ctx, params, pcmf32s); std::string results = output_str(ctx, params, pcmf32s);
res.set_content(results.c_str(), "text/html"); res.set_content(results.c_str(), "text/html");
} }
else if (params.response_format == srt_format)
{
std::stringstream ss;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2)
{
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
ss << i + 1 + params.offset_n << "\n";
ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
ss << speaker << text << "\n\n";
}
res.set_content(ss.str(), "application/x-subrip");
}
// TODO add more output formats // TODO add more output formats
else else
{ {