diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp index b61f8cff5fd..6272ce838a7 100644 --- a/examples/common-sdl.cpp +++ b/examples/common-sdl.cpp @@ -130,6 +130,7 @@ bool audio_async::clear() { m_audio_pos = 0; m_audio_len = 0; + m_audio_nxt = 0; } return true; @@ -172,6 +173,28 @@ void audio_async::callback(uint8_t * stream, int len) { } void audio_async::get(int ms, std::vector & result) { + if (ms <= 0) { + ms = m_len_ms; + } + + size_t n_samples = std::min(m_audio_len, (m_sample_rate * ms) / 1000); + + get_n(n_samples, result); +} + +void audio_async::next(std::vector & result) { + size_t n_samples; + + if (m_audio_pos >= m_audio_nxt) { + n_samples = m_audio_pos - m_audio_nxt; + } else { + n_samples = m_audio_len - m_audio_nxt + m_audio_pos; + } + + get_n(n_samples, result); +} + +void audio_async::get_n(size_t n_samples, std::vector & result) { if (!m_dev_id_in) { fprintf(stderr, "%s: no audio device to get audio from!\n", __func__); return; @@ -182,20 +205,9 @@ void audio_async::get(int ms, std::vector & result) { return; } - result.clear(); - { std::lock_guard lock(m_mutex); - if (ms <= 0) { - ms = m_len_ms; - } - - size_t n_samples = (m_sample_rate * ms) / 1000; - if (n_samples > m_audio_len) { - n_samples = m_audio_len; - } - result.resize(n_samples); int s0 = m_audio_pos - n_samples; @@ -205,10 +217,12 @@ void audio_async::get(int ms, std::vector & result) { if (s0 + n_samples > m_audio.size()) { const size_t n0 = m_audio.size() - s0; + m_audio_nxt = n_samples - n0; memcpy(result.data(), &m_audio[s0], n0 * sizeof(float)); - memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float)); + memcpy(&result[n0], &m_audio[0], m_audio_nxt * sizeof(float)); } else { + m_audio_nxt = s0 + n_samples; memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float)); } } diff --git a/examples/common-sdl.h b/examples/common-sdl.h index 9ee8a320724..746493f7c83 100644 --- a/examples/common-sdl.h +++ b/examples/common-sdl.h @@ -30,6 +30,8 @@ class 
audio_async { // get audio data from the circular buffer void get(int ms, std::vector & audio); + void next(std::vector & audio); + void get_n(size_t n_samples, std::vector & audio); private: SDL_AudioDeviceID m_dev_id_in = 0; @@ -43,6 +45,7 @@ class audio_async { std::vector m_audio; size_t m_audio_pos = 0; size_t m_audio_len = 0; + size_t m_audio_nxt = 0; }; // Return false if need to quit diff --git a/examples/stream/README.md b/examples/stream/README.md index f07cfb8915c..fb0ba37809d 100644 --- a/examples/stream/README.md +++ b/examples/stream/README.md @@ -12,7 +12,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a ## Sliding window mode with VAD -Setting the `--step` argument to `0` enables the sliding window mode: +Setting the `--step` argument to `0` or a negative value enables the sliding window mode: ```bash ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 @@ -25,6 +25,17 @@ It's best to tune it to the specific use case, but a value around `0.6` should b When silence is detected, it will transcribe the last `--length` milliseconds of audio and output a transcription block that is suitable for parsing. +You can also set the `--interim` argument to force transcription before the VAD detects silence. + +```bash + ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step -2000 --length 10000 -vth 0.6 --interim --keep 200 +``` + +This will transcribe the audio, keeping the last segment unconfirmed, every two seconds +even if the VAD says the speech is still ongoing. In this mode, if the sentence doesn't end +in `--length` milliseconds, the time window will not slide. The audio will be cut there +to be transcribed anyway, keeping the last `--keep` milliseconds for the next inference. + ## Building The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone.
You can build it like this: diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 190f68a2c3b..38a15c448e4 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -7,11 +7,23 @@ #include "whisper.h" #include +#include #include +#include +#include #include #include #include #include +#include + +#ifdef _WIN32 + #define NOMINMAX + #include + #include +#else + #include +#endif // command-line parameters @@ -21,8 +33,9 @@ struct whisper_params { int32_t length_ms = 10000; int32_t keep_ms = 200; int32_t capture_id = -1; - int32_t max_tokens = 32; + int32_t max_tokens = 128; int32_t audio_ctx = 0; + int32_t n_tmp_segs = 1; float vad_thold = 0.6f; float freq_thold = 100.0f; @@ -36,6 +49,8 @@ struct whisper_params { bool save_audio = false; // save audio to wav file bool use_gpu = true; bool flash_attn = false; + bool interim = false; + bool delete_vt100 = true; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; @@ -65,6 +80,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; } + else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } @@ -72,6 +88,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } + else if (arg == "-int" || arg 
== "--interim") { params.interim = true; } + else if (arg == "-nvt" || arg == "--no-vt100") { params.delete_vt100 = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -102,6 +120,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true"); + fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); @@ -109,6 +128,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false"); + fprintf(stderr, " -int, --interim [%-7s] show interim report in vad every step\n", params.interim ? "true" : "false"); + fprintf(stderr, " -nvt, --no-vt100 [%-7s] do not delete unconfirmed result\n", params.delete_vt100 ? 
"false" : "true"); fprintf(stderr, "\n"); } @@ -119,31 +140,52 @@ int main(int argc, char ** argv) { return 1; } - params.keep_ms = std::min(params.keep_ms, params.step_ms); - params.length_ms = std::max(params.length_ms, params.step_ms); - - const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE; - const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE; - const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE; - const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE; - - const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD + if (params.step_ms == 0) { + params.step_ms = -2000; // reasonable default for VAD + } + params.keep_ms = std::min(params.keep_ms, abs(params.step_ms)); + params.length_ms = std::max(params.length_ms, abs(params.step_ms)); - const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line + const int n_samples_step = (1e-3*abs(params.step_ms))*WHISPER_SAMPLE_RATE; + const int n_samples_len = (1e-3*params.length_ms )*WHISPER_SAMPLE_RATE; + const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE; + const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE; + const int n_samples_100ms= (1e-3*100.0 )*WHISPER_SAMPLE_RATE; - params.no_timestamps = !use_vad; - params.no_context |= use_vad; - params.max_tokens = 0; + const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD + const bool piped = !isatty(fileno(stdin)); // init audio audio_async audio(params.length_ms); - if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { - fprintf(stderr, "%s: audio.init() failed!\n", __func__); - return 1; - } - audio.resume(); + if (!piped) { + if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { + fprintf(stderr, "%s: audio.init() failed!\n", __func__); + return 1; + } + + audio.resume(); + } else { + fprintf(stderr, "%s: audio is from stdin, not from microphone\n", __func__); + + 
#ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); + #else + freopen(NULL, "rb", stdin); + #endif + + // non-blocking mode + #ifdef _WIN32 + DWORD mode; + HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); + GetConsoleMode(stdinHandle, &mode); + mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + SetConsoleMode(stdinHandle, mode); + #else + fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK); + #endif + } // whisper init if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){ @@ -159,9 +201,10 @@ int main(int argc, char ** argv) { struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); - std::vector pcmf32 (n_samples_30s, 0.0f); - std::vector pcmf32_old; - std::vector pcmf32_new(n_samples_30s, 0.0f); + std::vector pcmf32(n_samples_30s, 0.0f); + std::deque pcmf32_deque; + int n_samples_new = 0; + int n_samples_old = 0; std::vector prompt_tokens; @@ -186,10 +229,10 @@ int main(int argc, char ** argv) { params.translate ? "translate" : "transcribe", params.no_timestamps ? 
0 : 1); - if (!use_vad) { - fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context); - } else { + fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context); + if (use_vad) { fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__); + fprintf(stderr, "%s: interim report = %d, temporary segments = %d\n", __func__, params.interim, params.n_tmp_segs); } fprintf(stderr, "\n"); @@ -219,17 +262,16 @@ int main(int argc, char ** argv) { wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1); } - printf("[Start speaking]\n"); - fflush(stdout); + fprintf(stderr, "[Start speaking]\n"); + fflush(stderr); auto t_last = std::chrono::high_resolution_clock::now(); const auto t_start = t_last; + auto t_interim = t_last; + std::string s_tmp = ""; // main audio loop while (is_running) { - if (params.save_audio) { - wavWriter.write(pcmf32_new.data(), pcmf32_new.size()); - } // handle Ctrl + C is_running = sdl_poll_events(); @@ -238,62 +280,114 @@ int main(int argc, char ** argv) { } // process new audio + const auto t_now = std::chrono::high_resolution_clock::now(); + const auto t_diff = std::chrono::duration_cast(t_now - t_last).count(); - if (!use_vad) { - while (true) { - audio.get(params.step_ms, pcmf32_new); + if (!piped) { + const auto sleep_ms = abs(params.step_ms) - t_diff; + if (sleep_ms > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + continue; + } - if ((int) pcmf32_new.size() > 2*n_samples_step) { - fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__); - audio.clear(); - continue; - } + audio.next(pcmf32); - if ((int) pcmf32_new.size() >= n_samples_step) { - audio.clear(); + if ((int) pcmf32.size() > 2*n_samples_step) { + fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); + fprintf(stderr, "t_diff = %.2f sec, prev = %.2f sec, got = %.2f sec\n\n", 1e-3*t_diff, 
float(n_samples_new)/WHISPER_SAMPLE_RATE, float(pcmf32.size())/WHISPER_SAMPLE_RATE); + n_samples_old = 0; + n_samples_new = 0; + t_last = t_now; + continue; + } + } else { + // piped: need at least step_ms but try to get length_ms at first + const auto n_bytes_min = std::max(0, (n_samples_step - n_samples_new) * sizeof(float)); + auto n_bytes_wanted = n_samples_len * sizeof(float); + pcmf32.resize(n_samples_len); + + long n_bytes_read = 0; + while (n_bytes_wanted > 0) { + char *p_buf = (char *)pcmf32.data(); + const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted); + if (n_read == 0 || n_read == -1 && errno != EAGAIN) { + fprintf(stderr, "read(stdin) returned %zd, errno = %s\n", n_read, strerror(errno)); + is_running = false; // flush all results break; } - - std::this_thread::sleep_for(std::chrono::milliseconds(1)); + n_bytes_read += std::max(0, n_read); + if (n_bytes_read < n_bytes_min) { + n_bytes_wanted = n_bytes_min - n_bytes_read; + } else { + const auto n_mod = n_bytes_read % sizeof(float); + n_bytes_wanted = (n_mod != 0) ? 
sizeof(float) - n_mod : 0; + } + const auto est_sleep_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE; + std::this_thread::sleep_for(std::chrono::milliseconds(est_sleep_ms)); } + pcmf32.resize(n_bytes_read / sizeof(float)); + } - const int n_samples_new = pcmf32_new.size(); + int n_samples_buf = pcmf32.size(); - // take up to params.length_ms audio from previous iteration - const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new)); + if (params.save_audio && n_samples_buf > 0) { + wavWriter.write(pcmf32.data(), n_samples_buf); + } - //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size()); + copy(pcmf32.begin(), pcmf32.end(), std::back_inserter(pcmf32_deque)); + if (pcmf32_deque.size() > n_samples_30s) { + pcmf32_deque.erase(pcmf32_deque.begin(), pcmf32_deque.end() - n_samples_30s); + } - pcmf32.resize(n_samples_new + n_samples_take); + n_samples_new += n_samples_buf; - for (int i = 0; i < n_samples_take; i++) { - pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i]; - } + if (n_samples_old + n_samples_new == 0) { + continue; + } - memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float)); + // prepare pcmf32 for inference + n_samples_buf = n_samples_old + n_samples_new; + pcmf32.resize(n_samples_buf); + copy(pcmf32_deque.end() - n_samples_buf, pcmf32_deque.end(), pcmf32.begin()); - pcmf32_old = pcmf32; - } else { - const auto t_now = std::chrono::high_resolution_clock::now(); - const auto t_diff = std::chrono::duration_cast(t_now - t_last).count(); + // chop the audio unconditionally + bool use_keep_ms = ((!use_vad || params.interim) && n_samples_buf > n_samples_len); - if (t_diff < 2000) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // interim report in vad mode: once every step_ms, + // run the inference even if vad returns false, + // confirm (n_segments - 
params.n_tmp_segs) segments, + // and print other segments as s_tmp, which will be deleted + bool is_interim = false; - continue; - } + if (!use_vad || use_keep_ms || !is_running) { + use_keep_ms = true; + n_samples_old += n_samples_new; + n_samples_new = 0; - audio.get(2000, pcmf32_new); + t_last = t_now; + } else { + if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { + n_samples_new = 0; + n_samples_old = 0; - if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) { - audio.get(params.length_ms, pcmf32); + t_last = t_now; } else { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + const auto interim_diff_ms = std::chrono::duration_cast(t_now - t_interim).count(); - continue; + if (params.interim && interim_diff_ms > abs(params.step_ms)) { + is_interim = true; + n_samples_old += n_samples_new; + n_samples_new = 0; + } else { + // sliding window + n_samples_new -= n_samples_100ms; + n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms); + if (!piped) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + continue; + } } - - t_last = t_now; } // run the inference @@ -321,89 +415,156 @@ int main(int argc, char ** argv) { wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); wparams.prompt_n_tokens = params.no_context ? 
0 : prompt_tokens.size(); - if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { - fprintf(stderr, "%s: failed to process audio\n", argv[0]); - return 6; + // call whisper_full() with at least 1 sec of buffer + { + auto pcm_size = pcmf32.size(); + if (pcm_size < WHISPER_SAMPLE_RATE * 1.1) { + pcmf32.resize(pcm_size + WHISPER_SAMPLE_RATE, 0.0f); + } + if (whisper_full(ctx, wparams, pcmf32.data(), pcm_size) != 0) { + fprintf(stderr, "%s: failed to process audio\n", argv[0]); + return 6; + } + pcmf32.resize(pcm_size); } + t_interim = std::chrono::high_resolution_clock::now(); // print result; + int n_segments; + bool is_all_tmp = (!use_vad && n_samples_old < n_samples_len - n_samples_step); + std::ostringstream ss_output; + { - if (!use_vad) { + if (params.delete_vt100 && s_tmp.size()) { printf("\33[2K\r"); // print long empty line to clear the previous line - printf("%s", std::string(100, ' ').c_str()); + printf("%s", std::string(s_tmp.size(), ' ').c_str()); printf("\33[2K\r"); - } else { + } + s_tmp.clear(); + + n_segments = whisper_full_n_segments(ctx); + is_all_tmp = (is_running && (is_all_tmp || is_interim && n_segments <= params.n_tmp_segs)); + if (is_running && is_interim && !is_all_tmp) { + const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - params.n_tmp_segs - 1) * 10; + if (t1_ms < abs(params.step_ms)) { + // too short to confirm + is_all_tmp = true; + } else { + t_last += std::chrono::milliseconds(t1_ms); + const auto n_samples_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE; + pcmf32.resize(n_samples_confirmed); // for timestamps + n_samples_old -= n_samples_confirmed; // kept for next iteration + } + } + + bool show_n_iter = (use_vad && !params.no_timestamps && !is_all_tmp); + + if (show_n_iter) { const int64_t t1 = (t_last - t_start).count()/1000000; const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); - printf("\n"); - printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) 
t0, (int) t1); - printf("\n"); + ss_output << std::endl; + ss_output << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl; + ss_output << std::endl; } - const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { - const char * text = whisper_full_get_segment_text(ctx, i); + std::string text = whisper_full_get_segment_text(ctx, i); - if (params.no_timestamps) { - printf("%s", text); - fflush(stdout); + // last segment(s) may be s_tmp + if (i >= n_segments - params.n_tmp_segs && is_running && (is_all_tmp || is_interim)) { + if (params.no_timestamps && i > 0) { + ss_output << std::endl; + } + if (is_interim) { + // utf-8 cannot be simply cut, so use char32_t + std::wstring_convert, char32_t> conv; + const auto s_u32 = conv.from_bytes(text); + const auto s_sub = conv.to_bytes(s_u32.substr(0, s_u32.size() * 0.9)); + text = s_sub + "…"; + } + if (s_tmp.size() > 0) { + s_tmp += " "; + } + s_tmp += text; + continue; + } - if (params.fname_out.length() > 0) { - fout << text; + if (is_all_tmp) { + if (s_tmp.size() > 0) { + s_tmp += " "; + } + s_tmp += text; + } else if (params.no_timestamps) { + if (i > 0) { + ss_output << std::endl; } + ss_output << text; } else { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + const int64_t t_end = (t_last - t_start).count()/1000000; + const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); + const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = t_beg/10 + whisper_full_get_segment_t1(ctx, i); - std::string output = "[" + to_timestamp(t0, false) + " --> " + to_timestamp(t1, false) + "] " + text; + ss_output << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "] " << text; if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - output += " [SPEAKER_TURN]"; + ss_output << " [SPEAKER_TURN]"; } - 
output += "\n"; - - printf("%s", output.c_str()); - fflush(stdout); - - if (params.fname_out.length() > 0) { - fout << output; - } + ss_output << std::endl; } } - if (params.fname_out.length() > 0) { - fout << std::endl; + if (show_n_iter) { + ss_output << std::endl; + ss_output << "### Transcription " << n_iter << " END" << std::endl; + if (s_tmp.size() > 0) { + ss_output << std::endl; + } } + } - if (use_vad) { - printf("\n"); - printf("### Transcription %d END\n", n_iter); - } + if (params.fname_out.length() > 0) { + fout << ss_output.str(); + fout << std::endl; } - ++n_iter; + if (!is_all_tmp) { + ++n_iter; + } - if (!use_vad && (n_iter % n_new_line) == 0) { + printf("%s", ss_output.str().c_str()); + + if (s_tmp.size() > 0) { + if (!params.delete_vt100) { + s_tmp = "(" + s_tmp + ")\n"; + } + printf("%s", s_tmp.c_str()); + + // exclude s_tmp from context + n_segments -= is_all_tmp ? n_segments : params.n_tmp_segs; + } else { printf("\n"); + s_tmp = ""; - // keep part of the audio for next iteration to try to mitigate word boundary issues - pcmf32_old = std::vector(pcmf32.end() - n_samples_keep, pcmf32.end()); + if (use_keep_ms) { + // keep part of the audio for next iteration to try to mitigate word boundary issues + n_samples_old = std::min(n_samples_old, n_samples_keep); + } + } - // Add tokens of the last full length segment as the prompt - if (!params.no_context) { - prompt_tokens.clear(); + // Add tokens of the last full length segment as the prompt + if (n_segments > 0 && !params.no_context) { + prompt_tokens.clear(); - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const int token_count = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < token_count; ++j) { - prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j)); - } + for (int i = 0; i < n_segments; ++i) { + const int token_count = whisper_full_n_tokens(ctx, i); + for (int j = 0; j < token_count; ++j) { + 
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j)); } } }