From c41e4f27ec676896224fbdaff2b79e3aa30fe342 Mon Sep 17 00:00:00 2001
From: jiltseb
Date: Fri, 12 Apr 2024 10:32:10 +0000
Subject: [PATCH 1/2] added 'use_vad_model' to better handle vad segments

---
 faster_whisper/transcribe.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 0908dc6c..41966619 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -125,6 +125,7 @@ class BatchedInferencePipeline(Pipeline):
     def __init__(
         self,
         model,
+        use_vad_model: bool = True,
         options: Optional[NamedTuple] = None,
         tokenizer=None,
         device: Union[int, str, "torch.device"] = -1,
@@ -138,6 +139,7 @@ def __init__(
         self.preset_language = language
         self._batch_size = kwargs.pop("batch_size", None)
         self._num_workers = 1
+        self.use_vad_model = use_vad_model
         self.vad_onset = 0.500
         self.vad_offset = 0.363
         self.vad_model_url = "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"
@@ -161,10 +163,11 @@ def __init__(
         else:
             self.device = device
 
-        # load vad model and perform VAD preprocessing if needed
-        self.vad_model = self.load_vad_model(
-            vad_onset=self.vad_onset, vad_offset=self.vad_offset
-        )
+        if self.use_vad_model:
+            # load vad model and perform VAD preprocessing if needed
+            self.vad_model = self.load_vad_model(
+                vad_onset=self.vad_onset, vad_offset=self.vad_offset
+            )
         self.chunk_size = 30  # VAD merging size
         super(Pipeline, self).__init__()
 
@@ -483,15 +486,18 @@ def transcribe(
 
         # if no segment split is provided, use vad_model and generate segments
         if not vad_segments:
-            vad_segments = self.vad_model(
-                {"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000}
-            )
-            vad_segments = merge_chunks(
-                vad_segments,
-                self.chunk_size,
-                onset=self.vad_onset,
-                offset=self.vad_offset,
-            )
+            if self.use_vad_model:
+                vad_segments = self.vad_model(
+                    {"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000}
+                )
+                vad_segments = merge_chunks(
+                    vad_segments,
+                    self.chunk_size,
+                    onset=self.vad_onset,
+                    offset=self.vad_offset,
+                )
+            else:
+                raise RuntimeError("no vad_model found. Set 'use_vad_model' to True while loading the model")
 
         language, language_probability, task = self.get_language_and_tokenizer(
             audio, task, language

From 0e8fa00b64d57a67b1efa848dbcb30b20d313847 Mon Sep 17 00:00:00 2001
From: jiltseb
Date: Fri, 12 Apr 2024 11:54:36 +0000
Subject: [PATCH 2/2] Update error message

---
 faster_whisper/transcribe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 41966619..a3583b2a 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -497,7 +497,7 @@ def transcribe(
                     offset=self.vad_offset,
                 )
             else:
-                raise RuntimeError("no vad_model found. Set 'use_vad_model' to True while loading the model")
+                raise RuntimeError("No vad segments found. Set 'use_vad_model' to True while loading the model")
 
         language, language_probability, task = self.get_language_and_tokenizer(
             audio, task, language