ggerganov · tamo · Jan 1, 2025 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025
diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp
@@ -130,6 +130,7 @@ bool audio_async::clear() {
 
         m_audio_pos = 0;
         m_audio_len = 0;
+        m_audio_nxt = 0;
     }
 
     return true;
@@ -172,6 +173,28 @@ void audio_async::callback(uint8_t * stream, int len) {
 }
 
 void audio_async::get(int ms, std::vector<float> & result) {
+    if (ms <= 0) { // a non-positive ms means "use the default window length"
+        ms = m_len_ms; // fall back to the member default, expressed in milliseconds
+    }
+
+    size_t n_samples = std::min<size_t>(m_audio_len, (m_sample_rate * ms) / 1000); // ms -> sample count, capped at m_audio_len; NOTE(review): m_audio_len is read here before get_n() takes m_mutex (the removed code did this under the lock) - confirm it cannot race with the writer
+
+    get_n(n_samples, result); // get_n() performs the device check, takes m_mutex and does the actual copy into result
+}
+
+void audio_async::next(std::vector<float> & result) { // fetch the samples lying between m_audio_nxt and m_audio_pos
+    size_t n_samples; // assigned on both branches below
+
+    if (m_audio_pos >= m_audio_nxt) { // NOTE(review): these members are read without holding m_mutex; get_n() only locks later - confirm no race
+        n_samples = m_audio_pos - m_audio_nxt; // contiguous span [m_audio_nxt, m_audio_pos)
+    } else {
+        n_samples = m_audio_len - m_audio_nxt + m_audio_pos; // m_audio_pos is behind m_audio_nxt: tail after m_audio_nxt plus head up to m_audio_pos; presumably m_audio_len equals the buffer size here - TODO confirm
+    }
+
+    get_n(n_samples, result); // as far as the visible part of get_n() shows, it copies the n_samples ending at m_audio_pos and stores the index just past the last copied sample in m_audio_nxt
+}
+
+void audio_async::get_n(size_t n_samples, std::vector<float> & result) {
     if (!m_dev_id_in) {
         fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
         return;
@@ -182,20 +205,9 @@ void audio_async::get(int ms, std::vector<float> & result) {
         return;
     }
 
-    result.clear();
-
     {
         std::lock_guard<std::mutex> lock(m_mutex);
 
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
         result.resize(n_samples);
 
         int s0 = m_audio_pos - n_samples;
@@ -205,10 +217,12 @@ void audio_async::get(int ms, std::vector<float> & result) {
 
         if (s0 + n_samples > m_audio.size()) {
             const size_t n0 = m_audio.size() - s0;
+            m_audio_nxt = n_samples - n0;
 
             memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], m_audio_nxt * sizeof(float));
         } else {
+            m_audio_nxt = s0 + n_samples;
             memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
         }
     }

diff --git a/examples/common-sdl.h b/examples/common-sdl.h
@@ -30,6 +30,8 @@ class audio_async {
 
     // get audio data from the circular buffer
     void get(int ms, std::vector<float> & audio);
+    void next(std::vector<float> & audio);
+    void get_n(size_t n_samples, std::vector<float> & audio);
 
 private:
     SDL_AudioDeviceID m_dev_id_in = 0;
@@ -43,6 +45,7 @@ class audio_async {
     std::vector<float> m_audio;
     size_t             m_audio_pos = 0;
     size_t             m_audio_len = 0;
+    size_t             m_audio_nxt = 0;
 };
 
 // Return false if need to quit

diff --git a/examples/stream/README.md b/examples/stream/README.md
@@ -12,7 +12,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 
 ## Sliding window mode with VAD
 
-Setting the `--step` argument to `0` enables the sliding window mode:
+Setting the `--step` argument to `0` or a negative value enables the sliding window mode:
 
 ```bash
  ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
@@ -25,6 +25,17 @@ It's best to tune it to the specific use case, but a value around `0.6` should b
 When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
 a transcription block that is suitable for parsing.
 
+You can also set the `--interim` argument to force transcription before the VAD detects silence.
+
+```bash
+ ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step -2000 --length 10000 -vth 0.6 --interim --keep 200
+```
+
+This will transcribe the audio every two seconds, keeping the last segment unconfirmed,
+even if the VAD reports that speech is still ongoing. In this mode, if the sentence doesn't end
+within `--length` milliseconds, the time window will not slide. Instead, the audio is cut at that point
+and transcribed anyway, keeping the last `--keep` milliseconds for the next inference.
+
 ## Building
 
 The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this: