First Commit - #9

Open · wants to merge 3 commits into main
21 changes: 20 additions & 1 deletion README.md
@@ -91,7 +91,7 @@ Edit config.py to select the models you want to use:
# Model selection
TRANSCRIPTION_MODEL = 'groq' # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'
RESPONSE_MODEL = 'groq' # Options: 'openai', 'groq', 'ollama', 'local'
TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'
TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'cartesia', 'local', 'melotts', 'fastxttsapi'

# API keys and paths
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -130,6 +130,9 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
```shell
fastapi run main.py
```
> **Note:** To allow Verbi to query the FastWhisperAPI for transcription, ensure that the FastAPI server is running in a separate terminal in the background.


***Alternative Setup and Run Methods***

The API can also run in a Docker container or in Google Colab.
@@ -166,6 +169,22 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
```
You can run the main file to start using Verbi with local models.

9. 🔊 **Alternative Local TTS - Coqui XTTS v2**

_Optional step if you need a local Text to Speech model_

***Install Coqui XTTS from GitHub***

To set up the TTS server, follow the instructions in the [FastXttsAPI](https://github.com/3choff/FastXttsAPI) repository.

***Usage***

The multilingual TTS model is served through a FastAPI app that exposes a `/v1/speech` endpoint for speech generation, supporting both streaming and non-streaming responses. To list all 62 available studio voices, query the `/voices` endpoint. Best performance is achieved by running the model in a Docker container, but the server can also be run from the [Google Colab notebook](https://github.com/3choff/FastXttsAPI/blob/main/FastXttsAPI_notebook.ipynb) provided in the repository.
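
A minimal sketch of querying both endpoints with Python `requests` (assuming the server is reachable at `https://localhost:8000` with a self-signed certificate, and that "Dionisio Schuyler" is one of the studio voices returned by `/voices`):

```python
import requests

BASE_URL = "https://localhost:8000"  # assumed local FastXttsAPI address

# List the available studio voices (exact response format may differ)
voices = requests.get(f"{BASE_URL}/voices", verify=False).json()
print(voices)

# Request streamed speech and write the chunks to a file
payload = {
    "text": "Hello from Verbi!",
    "language": "en",
    "voice": "Dionisio Schuyler",  # pick any voice listed by /voices
    "stream": True,
}
with requests.post(f"{BASE_URL}/v1/speech", json=payload, stream=True, verify=False) as r:
    with open("output.wav", "wb") as f:  # audio container depends on the API settings
        for chunk in r.iter_content(chunk_size=512):
            f.write(chunk)
```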

> **Note:** To allow Verbi to query the FastXttsAPI for speech synthesis, ensure that the FastAPI server is running in a separate terminal in the background.

> **Fun Tip:** Explore voice cloning with FastXttsAPI! You can clone a voice using an audio clip of at least 10 seconds. Simply add the voice’s embedding to the 'studio_speakers' folder in FastXttsAPI, and enjoy interacting with a personalized Verbi chatbot.

## Model Options ⚙️

#### Transcription Models 🎤
1 change: 0 additions & 1 deletion requirements.txt
@@ -26,7 +26,6 @@ python-dotenv
colorama
requests
keyboard
elevenlabs
fastapi
uvicorn
numpy
9 changes: 5 additions & 4 deletions run_voice_assistant.py
@@ -72,11 +72,12 @@ def main():
text_to_speech(Config.TTS_MODEL, tts_api_key, response_text, output_file, Config.LOCAL_MODEL_PATH)

# Play the generated speech audio
play_audio(output_file)

# Clean up audio files
# delete_file(Config.INPUT_AUDIO)
# delete_file(output_file)
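# Streaming TTS backends (fastxttsapi, elevenlabs) play the audio through ffplay as it arrives, so only non-streaming models need the saved file played back here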
if Config.TTS_MODEL not in ['fastxttsapi', 'elevenlabs']:
play_audio(output_file)
# Clean up audio files
# delete_file(Config.INPUT_AUDIO)
# delete_file(output_file)

except Exception as e:
logging.error(Fore.RED + f"An error occurred: {e}" + Fore.RESET)
2 changes: 2 additions & 0 deletions voice_assistant/api_key_manager.py
@@ -43,4 +43,6 @@ def get_tts_api_key():
return Config.DEEPGRAM_API_KEY
elif Config.TTS_MODEL == 'elevenlabs':
return Config.ELEVENLABS_API_KEY
elif Config.TTS_MODEL == 'cartesia':
return Config.CARTESIA_API_KEY
return None
37 changes: 36 additions & 1 deletion voice_assistant/audio.py
@@ -5,9 +5,16 @@
import time
import logging
import pydub
import subprocess
import shutil
from io import BytesIO
from pydub import AudioSegment


def is_installed(command):
    return shutil.which(command) is not None

def record_audio(file_path, timeout=10, phrase_time_limit=None, retries=3, energy_threshold=2000, pause_threshold=1, phrase_threshold=0.1, dynamic_energy_threshold=True, calibration_duration=1):
"""
Record audio from the microphone and save it as an MP3 file.
@@ -67,4 +74,32 @@ def play_audio(file_path):
except pygame.error as e:
logging.error(f"Failed to play audio: {e}")
except Exception as e:
logging.error(f"An unexpected error occurred while playing audio: {e}")
logging.error(f"An unexpected error occurred while playing audio: {e}")
def play_audio_stream(audio_stream):
"""
Play an audio stream using ffplay.

Args:
audio_stream (generator): The audio stream to play.
"""
# Use subprocess to pipe the audio data to ffplay and play it
if not is_installed("ffplay"):
raise ValueError("ffplay not found, necessary to stream audio.")
ffplay_cmd = ['ffplay', '-probesize', '512', '-autoexit', '-', "-nodisp"]
ffplay_proc = subprocess.Popen(
ffplay_cmd,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)

try:
for chunk in audio_stream:
ffplay_proc.stdin.write(chunk)
# print("Received and played a chunk")
except Exception as e:
logging.error(f"An error occurred: {e}")
finally:
if ffplay_proc.stdin:
ffplay_proc.stdin.close()
ffplay_proc.wait()
10 changes: 5 additions & 5 deletions voice_assistant/config.py
@@ -13,7 +13,7 @@ class Config:
Attributes:
TRANSCRIPTION_MODEL (str): The model to use for transcription ('openai', 'groq', 'deepgram', 'fastwhisperapi', 'local').
RESPONSE_MODEL (str): The model to use for response generation ('openai', 'groq', 'ollama', 'local').
TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'local').
TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local').
OPENAI_API_KEY (str): API key for OpenAI services.
GROQ_API_KEY (str): API key for Groq services.
DEEPGRAM_API_KEY (str): API key for Deepgram services.
@@ -22,8 +22,8 @@
"""
# Model selection
TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
RESPONSE_MODEL = 'ollama' # possible values: openai, groq, ollama
TTS_MODEL = 'deepgram' # possible values: openai, deepgram, elevenlabs, melotts, cartesia
RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
TTS_MODEL = 'fastxttsapi' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, fastxttsapi

# currently using the MeloTTS for local models. here is how to get started:
# https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install
@@ -59,8 +59,8 @@ def validate_config():
raise ValueError("Invalid TRANSCRIPTION_MODEL. Must be one of ['openai', 'groq', 'deepgram', 'fastwhisperapi', 'local']")
if Config.RESPONSE_MODEL not in ['openai', 'groq', 'ollama', 'local']:
raise ValueError("Invalid RESPONSE_MODEL. Must be one of ['openai', 'groq', 'local']")
if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']:
raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']")
if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']:
raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']")

if Config.TRANSCRIPTION_MODEL == 'openai' and not Config.OPENAI_API_KEY:
raise ValueError("OPENAI_API_KEY is required for OpenAI models")
42 changes: 33 additions & 9 deletions voice_assistant/text_to_speech.py
@@ -1,11 +1,11 @@
# voice_assistant/text_to_speech.py
import logging
import elevenlabs
import requests

from openai import OpenAI
from deepgram import DeepgramClient, SpeakOptions
from elevenlabs.client import ElevenLabs
from cartesia.tts import CartesiaTTS
from voice_assistant.audio import play_audio_stream
import soundfile as sf
import json

@@ -17,7 +17,7 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
Convert text to speech using the specified model.

Args:
model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'local').
model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'cartesia', 'melotts', 'fastxttsapi', 'local').
api_key (str): The API key for the TTS service.
text (str): The text to convert to speech.
output_file_path (str): The path to save the generated speech audio file.
@@ -48,11 +48,24 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)
elif model == 'elevenlabs':
ELEVENLABS_VOICE_ID = "Paul J."
client = ElevenLabs(api_key=api_key)
audio = client.generate(
text=text, voice=ELEVENLABS_VOICE_ID, output_format="mp3_22050_32", model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)
ELEVENLABS_URL = f'https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}/stream'
headers = {
'accept': '*/*',
'xi-api-key': api_key,
'Content-Type': 'application/json'
}
data = {
'text': text,
'voice_settings': {
'stability': 0.50,
'similarity_boost': 0.75
},
"output_format": "mp3_22050_32"
}

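# Stream the ElevenLabs response straight to ffplay instead of saving it to output_file_path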
with requests.post(ELEVENLABS_URL, headers=headers, json=data, stream=True) as r:
audio_stream = r.iter_content(chunk_size=512)
play_audio_stream(audio_stream)
elif model == "cartesia":
# config
with open('Barbershop Man.json') as f:
@@ -73,9 +86,20 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
rate = output["sampling_rate"]
sf.write(output_file_path, buffer, rate)


elif model == "melotts": # this is a local model
generate_audio_file_melotts(text=text, filename=output_file_path)
elif model == "fastxttsapi":
# Set the URL for the FastXttsAPI server; change this to the address where the API is running, either locally or on a remote server
FASTXTTSAPI_URL = 'https://localhost:8000'
payload = {
"text": text,
"language": "en",
"voice": "Dionisio Schuyler", #Query the endpoint https://localhost:8000/voices to get the list of available voices
"stream": True,
}
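# verify=False accepts the self-signed certificate of a local FastXttsAPI instance; enable verification if the server has a trusted certificate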
with requests.post(FASTXTTSAPI_URL + "/v1/speech", json=payload, verify=False) as r:
audio_stream = r.iter_content(chunk_size=512)
play_audio_stream(audio_stream)
elif model == 'local':
# Placeholder for local TTS model
with open(output_file_path, "wb") as f:
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added voice_samples/Xtts_sample1.wav