#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A simple example showcasing the use of `pywhispercpp` as an assistant.
The idea is to use a `VAD` to detect speech (in this example we used webrtcvad), and when speech is detected
we run the inference.
"""
import argparse
import importlib.metadata
import queue
import time
from typing import Callable, Optional
import numpy as np
import sounddevice as sd
import pywhispercpp.constants as constants
import webrtcvad
import logging
from pywhispercpp._logger import set_log_level
from pywhispercpp.model import Model

__version__ = importlib.metadata.version('pywhispercpp')
__header__ = f"""
=====================================
PyWhisperCpp
A simple assistant using Whisper.cpp
Version: {__version__}
=====================================
"""


class Assistant:
    """
    Assistant class

    Example usage:

    ```python
    from pywhispercpp.examples.assistant import Assistant

    my_assistant = Assistant(commands_callback=print, n_threads=8)
    my_assistant.start()
    ```
    """

    def __init__(self,
                 model='tiny',
                 input_device: Optional[int] = None,
                 silence_threshold: int = 8,
                 q_threshold: int = 16,
                 block_duration: int = 30,
                 commands_callback: Optional[Callable[[str], None]] = None,
                 model_log_level: int = logging.INFO,
                 **model_params):
        """
        :param model: whisper.cpp model name or a direct path to a `ggml` model
        :param input_device: The input device (aka microphone); leave it as None to use the default
        :param silence_threshold: The number of consecutive silent blocks after which inference runs
        :param q_threshold: Inference won't run until the audio queue holds at least `q_threshold` blocks
        :param block_duration: Duration of each audio block in ms
        :param commands_callback: The callback to run when a command is received
        :param model_log_level: Logging level
        :param model_params: Any other parameter to pass to the whisper.cpp model, see ::: pywhispercpp.constants.PARAMS_SCHEMA
        """
        self.input_device = input_device
        self.sample_rate = constants.WHISPER_SAMPLE_RATE  # same as whisper.cpp
        self.channels = 1  # same as whisper.cpp
        self.block_duration = block_duration
        self.block_size = int(self.sample_rate * self.block_duration / 1000)
        self.q = queue.Queue()

        self.vad = webrtcvad.Vad()
        self.silence_threshold = silence_threshold
        self.q_threshold = q_threshold
        self._silence_counter = 0

        self.pwccp_model = Model(model,
                                 log_level=model_log_level,
                                 print_realtime=False,
                                 print_progress=False,
                                 print_timestamps=False,
                                 single_segment=True,
                                 no_context=True,
                                 **model_params)
        self.commands_callback = commands_callback

    def _audio_callback(self, indata, frames, time, status):
        """
        This is called (from a separate thread) for each audio block.
        """
        if status:
            logging.warning(f"Underlying audio stack warning: {status}")

        assert frames == self.block_size
        # webrtcvad expects 16-bit signed mono PCM, so convert the float
        # samples in [-1, +1] to int16 bytes before running speech detection
        audio_data = (indata.ravel() * 32767).astype(np.int16).tobytes()
        detection = self.vad.is_speech(audio_data, self.sample_rate)
        if detection:
            self.q.put(indata.copy())
            self._silence_counter = 0
        else:
            if self._silence_counter >= self.silence_threshold:
                if self.q.qsize() > self.q_threshold:
                    self._transcribe_speech()
                self._silence_counter = 0
            else:
                self._silence_counter += 1

    def _transcribe_speech(self):
        logging.info("Speech detected ...")
        audio_data = np.array([])
        while self.q.qsize() > 0:
            # drain all pending audio blocks from the queue
            audio_data = np.append(audio_data, self.q.get())
        # pad the audio with silence as a workaround for very short
        # utterances (short commands)
        audio_data = np.concatenate([audio_data, np.zeros(int(self.sample_rate) + 10)])
        # run the inference
        self.pwccp_model.transcribe(audio_data,
                                    new_segment_callback=self._new_segment_callback)

    def _new_segment_callback(self, seg):
        if self.commands_callback:
            self.commands_callback(seg[0].text)

    def start(self) -> None:
        """
        Use this function to start the assistant
        :return: None
        """
        logging.info("Starting Assistant ...")
        with sd.InputStream(
                device=self.input_device,  # None means the default input device
                channels=self.channels,
                samplerate=constants.WHISPER_SAMPLE_RATE,
                blocksize=self.block_size,
                callback=self._audio_callback):
            try:
                logging.info("Assistant is listening ... (CTRL+C to stop)")
                while True:
                    time.sleep(0.1)
            except KeyboardInterrupt:
                logging.info("Assistant stopped")

    @staticmethod
    def available_devices():
        return sd.query_devices()
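
# To pick a microphone explicitly, you can first list the available devices
# (this just prints sounddevice's query_devices() output) and pass the chosen
# id as `input_device`; the id 1 below is only a placeholder:
#
#   print(Assistant.available_devices())
#   my_assistant = Assistant(input_device=1, commands_callback=print)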


def _main():
    print(__header__)
    parser = argparse.ArgumentParser(description="A simple assistant using Whisper.cpp",
                                     allow_abbrev=True)
    parser.add_argument('-m', '--model', default='tiny.en', type=str,
                        help="Whisper.cpp model, defaults to %(default)s")
    parser.add_argument('-ind', '--input_device', type=int, default=None,
                        help=f'Id of the input device (aka microphone)\n'
                             f'available devices {Assistant.available_devices()}')
    parser.add_argument('-st', '--silence_threshold', default=16, type=int,
                        help="The number of silent blocks after which inference runs, defaults to %(default)s")
    parser.add_argument('-bd', '--block_duration', default=30, type=int,
                        help="Duration of each audio block in ms, defaults to %(default)s")

    args = parser.parse_args()

    my_assistant = Assistant(model=args.model,
                             input_device=args.input_device,
                             silence_threshold=args.silence_threshold,
                             block_duration=args.block_duration,
                             commands_callback=print)
    my_assistant.start()


if __name__ == '__main__':
    _main()
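
# A minimal sketch of wiring a custom commands_callback; `handle_command` is a
# hypothetical function, not part of pywhispercpp:
#
#   def handle_command(text: str) -> None:
#       print(f"Command received: {text}")
#
#   Assistant(commands_callback=handle_command, n_threads=8).start()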