minor updates

jhakulin · Feb 9, 2025 · 4dcdb41
1 parent 1c664ae
commit 4dcdb41
Show file tree

Hide file tree

Showing 7 changed files with 100 additions and 42 deletions.
diff --git a/samples/async/sample_realtime_ai_with_keyword_and_vad.py b/samples/async/sample_realtime_ai_with_keyword_and_vad.py
@@ -267,8 +267,12 @@ async def on_response_output_item_added(self, event: ResponseOutputItemAdded) ->
                 # Properly acquire the lock with 'await' and spread the usage over two lines
                 await self._lock.acquire()  # Wait until the lock is available, then acquire it
                 try:
-                    self._call_id_to_function_name[call_id] = function_name
-                    logger.debug(f"Registered function call. Call ID: {call_id}, Function Name: {function_name}")
+                    # Only register the call_id if we haven't seen it before
+                    if call_id in self._call_id_to_function_name:
+                        logger.debug(f"Ignoring duplicated function call registration for call_id: {call_id}")
+                    else:
+                        self._call_id_to_function_name[call_id] = function_name
+                        logger.debug(f"Registered new function call. Call ID: {call_id}, Function Name: {function_name}")
                 finally:
                     # Ensure the lock is released even if an exception occurs
                     self._lock.release()
@@ -371,7 +375,6 @@ async def main():
     audio_capture = None
 
     try:
-
         azure_openai_endpoint, api_key, azure_api_version = get_openai_configuration()
         if not api_key:
             return
@@ -411,25 +414,35 @@ async def main():
         await client.start()
 
         loop = asyncio.get_running_loop()
-        
+
         audio_capture_event_handler = MyAudioCaptureEventHandler(
             client=client,
             event_handler=event_handler,
             event_loop=loop,
         )
-        vad_parameters={
+
+        # Set local VAD parameters depending on the VAD model used
+        if USE_SILERO_VAD_MODEL:
+            logger.info("Using Silero VAD...")
+            vad_parameters = {
+                "sample_rate": 24000,
+                "chunk_size": 1024,
+                "window_size_samples": 512,
+                "threshold": 0.25,
+                "min_speech_duration": 0.3,
+                "min_silence_duration": 1.0,
+                "model_path": str(RESOURCES_DIR / "silero_vad.onnx")
+            }
+        else:
+            logger.info("Using VoiceActivityDetector...")
+            vad_parameters = {
                 "sample_rate": 24000,
                 "chunk_size": 1024,
                 "window_duration": 1.5,
                 "silence_ratio": 1.5,
                 "min_speech_duration": 0.3,
                 "min_silence_duration": 1.0
             }
-        if USE_SILERO_VAD_MODEL:
-            logger.info("using Silero VAD...")
-            vad_parameters["model_path"] = str(RESOURCES_DIR / "silero_vad.onnx")
-        else:
-            logger.info("using VoiceActivityDetector...")
 
         # Initialize AudioCapture with the event handler
         audio_capture = AudioCapture(

diff --git a/samples/async/sample_realtime_ai_with_local_vad.py b/samples/async/sample_realtime_ai_with_local_vad.py
@@ -199,8 +199,12 @@ async def on_response_output_item_added(self, event: ResponseOutputItemAdded) ->
                 # Properly acquire the lock with 'await' and spread the usage over two lines
                 await self._lock.acquire()  # Wait until the lock is available, then acquire it
                 try:
-                    self._call_id_to_function_name[call_id] = function_name
-                    logger.debug(f"Registered function call. Call ID: {call_id}, Function Name: {function_name}")
+                    # Only register the call_id if we haven't seen it before
+                    if call_id in self._call_id_to_function_name:
+                        logger.debug(f"Ignoring duplicated function call registration for call_id: {call_id}")
+                    else:
+                        self._call_id_to_function_name[call_id] = function_name
+                        logger.debug(f"Registered new function call. Call ID: {call_id}, Function Name: {function_name}")
                 finally:
                     # Ensure the lock is released even if an exception occurs
                     self._lock.release()
@@ -319,25 +323,35 @@ async def main():
         await client.start()
 
         loop = asyncio.get_running_loop()
-        
+
         audio_capture_event_handler = MyAudioCaptureEventHandler(
             client=client,
             event_handler=event_handler,
             event_loop=loop,
         )
-        vad_parameters={
+
+        # Set local VAD parameters depending on the VAD model used
+        if USE_SILERO_VAD_MODEL:
+            logger.info("Using Silero VAD...")
+            vad_parameters = {
+                "sample_rate": 24000,
+                "chunk_size": 1024,
+                "window_size_samples": 512,
+                "threshold": 0.25,
+                "min_speech_duration": 0.3,
+                "min_silence_duration": 1.0,
+                "model_path": str(RESOURCES_DIR / "silero_vad.onnx")
+            }
+        else:
+            logger.info("Using VoiceActivityDetector...")
+            vad_parameters = {
                 "sample_rate": 24000,
                 "chunk_size": 1024,
                 "window_duration": 1.5,
                 "silence_ratio": 1.5,
                 "min_speech_duration": 0.3,
                 "min_silence_duration": 1.0
             }
-        if USE_SILERO_VAD_MODEL:
-            logger.info("using Silero VAD...")
-            vad_parameters["model_path"] = str(RESOURCES_DIR / "silero_vad.onnx")
-        else:
-            logger.info("using VoiceActivityDetector...")
 
         # Initialize AudioCapture with the event handler
         audio_capture = AudioCapture(

diff --git a/samples/sample_realtime_ai_text_input.py b/samples/sample_realtime_ai_text_input.py
@@ -189,8 +189,12 @@ def on_response_output_item_added(self, event: ResponseOutputItemAdded):
             function_name = event.item.get("name")
             if call_id and function_name:
                 with self._lock:
-                    self._call_id_to_function_name[call_id] = function_name
-                logger.debug(f"Registered function call. Call ID: {call_id}, Function Name: {function_name}")
+                    # Only register the call_id if we haven't seen it before
+                    if call_id in self._call_id_to_function_name:
+                        logger.debug(f"Ignoring duplicated function call registration for call_id: {call_id}")
+                    else:
+                        self._call_id_to_function_name[call_id] = function_name
+                        logger.debug(f"Registered new function call. Call ID: {call_id}, Function Name: {function_name}")
             else:
                 logger.warning("Function call item missing 'call_id' or 'name' fields.")
 

diff --git a/samples/sample_realtime_ai_with_keyword_and_vad.py b/samples/sample_realtime_ai_with_keyword_and_vad.py
@@ -251,8 +251,12 @@ def on_response_output_item_added(self, event: ResponseOutputItemAdded):
             function_name = event.item.get("name")
             if call_id and function_name:
                 with self._lock:
-                    self._call_id_to_function_name[call_id] = function_name
-                logger.debug(f"Registered function call. Call ID: {call_id}, Function Name: {function_name}")
+                    # Only register the call_id if we haven't seen it before
+                    if call_id in self._call_id_to_function_name:
+                        logger.debug(f"Ignoring duplicated function call registration for call_id: {call_id}")
+                    else:
+                        self._call_id_to_function_name[call_id] = function_name
+                        logger.debug(f"Registered new function call. Call ID: {call_id}, Function Name: {function_name}")
             else:
                 logger.warning("Function call item missing 'call_id' or 'name' fields.")
 
@@ -390,19 +394,29 @@ def main():
             client=client,
             event_handler=event_handler
         )
-        vad_parameters={
+
+        # Set local VAD parameters depending on the VAD model used
+        if USE_SILERO_VAD_MODEL:
+            logger.info("Using Silero VAD...")
+            vad_parameters = {
+                "sample_rate": 24000,
+                "chunk_size": 1024,
+                "window_size_samples": 512,
+                "threshold": 0.25,
+                "min_speech_duration": 0.3,
+                "min_silence_duration": 1.0,
+                "model_path": str(RESOURCES_DIR / "silero_vad.onnx")
+            }
+        else:
+            logger.info("Using VoiceActivityDetector...")
+            vad_parameters = {
                 "sample_rate": 24000,
                 "chunk_size": 1024,
                 "window_duration": 1.5,
                 "silence_ratio": 1.5,
                 "min_speech_duration": 0.3,
                 "min_silence_duration": 1.0
             }
-        if USE_SILERO_VAD_MODEL:
-            logger.info("using Silero VAD...")
-            vad_parameters["model_path"] = str(RESOURCES_DIR / "silero_vad.onnx")
-        else:
-            logger.info("using VoiceActivityDetector...")
 
         # Initialize AudioCapture with the event handler
         audio_capture = AudioCapture(

diff --git a/samples/sample_realtime_ai_with_local_vad.py b/samples/sample_realtime_ai_with_local_vad.py
@@ -189,8 +189,12 @@ def on_response_output_item_added(self, event: ResponseOutputItemAdded):
             function_name = event.item.get("name")
             if call_id and function_name:
                 with self._lock:
-                    self._call_id_to_function_name[call_id] = function_name
-                logger.debug(f"Registered function call. Call ID: {call_id}, Function Name: {function_name}")
+                    # Only register the call_id if we haven't seen it before
+                    if call_id in self._call_id_to_function_name:
+                        logger.debug(f"Ignoring duplicated function call registration for call_id: {call_id}")
+                    else:
+                        self._call_id_to_function_name[call_id] = function_name
+                        logger.debug(f"Registered new function call. Call ID: {call_id}, Function Name: {function_name}")
             else:
                 logger.warning("Function call item missing 'call_id' or 'name' fields.")
 
@@ -304,19 +308,29 @@ def main():
             client=client,
             event_handler=event_handler
         )
-        vad_parameters={
+
+        # Set local VAD parameters depending on the VAD model used
+        if USE_SILERO_VAD_MODEL:
+            logger.info("Using Silero VAD...")
+            vad_parameters = {
+                "sample_rate": 24000,
+                "chunk_size": 1024,
+                "window_size_samples": 512,
+                "threshold": 0.25,
+                "min_speech_duration": 0.3,
+                "min_silence_duration": 1.0,
+                "model_path": str(RESOURCES_DIR / "silero_vad.onnx")
+            }
+        else:
+            logger.info("Using VoiceActivityDetector...")
+            vad_parameters = {
                 "sample_rate": 24000,
                 "chunk_size": 1024,
                 "window_duration": 1.5,
                 "silence_ratio": 1.5,
                 "min_speech_duration": 0.3,
                 "min_silence_duration": 1.0
             }
-        if USE_SILERO_VAD_MODEL:
-            logger.info("using Silero VAD...")
-            vad_parameters["model_path"] = str(RESOURCES_DIR / "silero_vad.onnx")
-        else:
-            logger.info("using VoiceActivityDetector...")
 
         # Initialize AudioCapture with the event handler
         audio_capture = AudioCapture(

diff --git a/samples/utils/audio_capture.py b/samples/utils/audio_capture.py
@@ -105,7 +105,7 @@ def __init__(
 
         if vad_parameters is not None:
             try:
-                if vad_parameters.get("model_path"):
+                if "model_path" in vad_parameters and isinstance(vad_parameters["model_path"], str) and vad_parameters["model_path"].strip():
                     self.vad = SileroVoiceActivityDetector(**vad_parameters)
                 else:
                     self.vad = VoiceActivityDetector(**vad_parameters)
@@ -160,7 +160,7 @@ def start(self):
             except Exception as e:
                 logger.error(f"Failed to start AzureKeywordRecognizer: {e}")
 
-        # ensure the pyaudio instance is initialized
+        # Ensure the PyAudio instance is initialized
         if not self.pyaudio_instance:
             self.pyaudio_instance = pyaudio.PyAudio()
 

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="realtime-ai",
-    version="0.1.8",
+    version="0.1.9",
     description="Python SDK for real-time audio processing with OpenAI's Realtime REST API.",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -12,10 +12,9 @@
     package_dir={"": "src"},
     install_requires=[
         "pyaudio",
-        "numpy",
+        "numpy>=1.24,<2.2",  # Ensures compatibility with numba
         "websockets",
         "websocket-client",
-        "azure-cognitiveservices-speech",
     ],
     classifiers=[
         "Programming Language :: Python :: 3",