First Commit - #9

Open · wants to merge 3 commits into main
21 changes: 20 additions & 1 deletion README.md
@@ -91,7 +91,7 @@ Edit config.py to select the models you want to use:
# Model selection
TRANSCRIPTION_MODEL = 'groq' # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'
RESPONSE_MODEL = 'groq' # Options: 'openai', 'groq', 'ollama', 'local'
TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'
TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'cartesia', 'local', 'melotts', 'fastxttsapi'

# API keys and paths
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -130,6 +130,9 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
```shell
fastapi run main.py
```
> **Note:** To allow Verbi to query the FastWhisperAPI for transcription, ensure that the FastAPI server is running in a separate terminal in the background.


***Alternative Setup and Run Methods***

The API can also run in a Docker container or in Google Colab.
@@ -166,6 +169,22 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
```
You can run the main file to start using Verbi with local models.

9. 🔊 **Alternative Local TTS - Coqui XTTS v2**

_Optional step if you need a local Text to Speech model_

***Install Coqui XTTS from GitHub***

To set up the TTS server, follow the instructions in the [FastXttsAPI](https://github.com/3choff/FastXttsAPI) repository.

***Usage***

The multilingual TTS model is served through a FastAPI app that exposes a `/v1/speech` endpoint for speech generation, supporting both streaming and non-streaming responses. To list all 62 available studio voices, query the `/voices` endpoint. Best performance is achieved by running the model in a Docker container, but the server can also be run from the [Google Colab notebook](https://github.com/3choff/FastXttsAPI/blob/main/FastXttsAPI_notebook.ipynb) provided in the repository.
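
A minimal sketch of querying both endpoints with Python `requests` (assuming the server is reachable at `https://localhost:8000` with a self-signed certificate, and that "Dionisio Schuyler" is one of the studio voices returned by `/voices`):

```python
import requests

BASE_URL = "https://localhost:8000"  # assumed local FastXttsAPI address

# List the available studio voices (exact response format may differ)
voices = requests.get(f"{BASE_URL}/voices", verify=False).json()
print(voices)

# Request streamed speech and write the chunks to a file
payload = {
    "text": "Hello from Verbi!",
    "language": "en",
    "voice": "Dionisio Schuyler",  # pick any voice listed by /voices
    "stream": True,
}
with requests.post(f"{BASE_URL}/v1/speech", json=payload, stream=True, verify=False) as r:
    with open("output.wav", "wb") as f:  # audio container depends on the API settings
        for chunk in r.iter_content(chunk_size=512):
            f.write(chunk)
```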

> **Note:** To allow Verbi to query the FastXttsAPI for speech synthesis, ensure that the FastAPI server is running in a separate terminal in the background.

> **Fun Tip:** Explore voice cloning with FastXttsAPI! You can clone a voice using an audio clip of at least 10 seconds. Simply add the voice’s embedding to the 'studio_speakers' folder in FastXttsAPI, and enjoy interacting with a personalized Verbi chatbot.

## Model Options ⚙️

#### Transcription Models 🎤
1 change: 0 additions & 1 deletion requirements.txt
@@ -26,7 +26,6 @@ python-dotenv
colorama
requests
keyboard
elevenlabs
fastapi
uvicorn
numpy
9 changes: 5 additions & 4 deletions run_voice_assistant.py
@@ -72,11 +72,12 @@ def main():
text_to_speech(Config.TTS_MODEL, tts_api_key, response_text, output_file, Config.LOCAL_MODEL_PATH)

# Play the generated speech audio
play_audio(output_file)

# Clean up audio files
# delete_file(Config.INPUT_AUDIO)
# delete_file(output_file)
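# Streaming TTS backends (fastxttsapi, elevenlabs) play the audio through ffplay as it arrives, so only non-streaming models need the saved file played back here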
if Config.TTS_MODEL not in ['fastxttsapi', 'elevenlabs']:
play_audio(output_file)
# Clean up audio files
# delete_file(Config.INPUT_AUDIO)
# delete_file(output_file)

except Exception as e:
logging.error(Fore.RED + f"An error occurred: {e}" + Fore.RESET)
2 changes: 2 additions & 0 deletions voice_assistant/api_key_manager.py
@@ -43,4 +43,6 @@ def get_tts_api_key():
return Config.DEEPGRAM_API_KEY
elif Config.TTS_MODEL == 'elevenlabs':
return Config.ELEVENLABS_API_KEY
elif Config.TTS_MODEL == 'cartesia':
return Config.CARTESIA_API_KEY
return None
37 changes: 36 additions & 1 deletion voice_assistant/audio.py
@@ -5,9 +5,16 @@
import time
import logging
import pydub
import subprocess
import shutil
from io import BytesIO
from pydub import AudioSegment


def is_installed(command):
    return shutil.which(command) is not None

def record_audio(file_path, timeout=10, phrase_time_limit=None, retries=3, energy_threshold=2000, pause_threshold=1, phrase_threshold=0.1, dynamic_energy_threshold=True, calibration_duration=1):
"""
Record audio from the microphone and save it as an MP3 file.
@@ -67,4 +74,32 @@ def play_audio(file_path):
except pygame.error as e:
logging.error(f"Failed to play audio: {e}")
except Exception as e:
logging.error(f"An unexpected error occurred while playing audio: {e}")
logging.error(f"An unexpected error occurred while playing audio: {e}")
def play_audio_stream(audio_stream):
"""
Play an audio stream using ffplay.

Args:
audio_stream (generator): The audio stream to play.
"""
# Use subprocess to pipe the audio data to ffplay and play it
if not is_installed("ffplay"):
raise ValueError("ffplay not found, necessary to stream audio.")
ffplay_cmd = ['ffplay', '-probesize', '512', '-autoexit', '-', "-nodisp"]
ffplay_proc = subprocess.Popen(
ffplay_cmd,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)

try:
for chunk in audio_stream:
ffplay_proc.stdin.write(chunk)
# print("Received and played a chunk")
except Exception as e:
logging.error(f"An error occurred: {e}")
finally:
if ffplay_proc.stdin:
ffplay_proc.stdin.close()
ffplay_proc.wait()
10 changes: 5 additions & 5 deletions voice_assistant/config.py
@@ -13,7 +13,7 @@ class Config:
Attributes:
TRANSCRIPTION_MODEL (str): The model to use for transcription ('openai', 'groq', 'deepgram', 'fastwhisperapi', 'local').
RESPONSE_MODEL (str): The model to use for response generation ('openai', 'groq', 'ollama', 'local').
TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'local').
TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local').
OPENAI_API_KEY (str): API key for OpenAI services.
GROQ_API_KEY (str): API key for Groq services.
DEEPGRAM_API_KEY (str): API key for Deepgram services.
@@ -22,8 +22,8 @@
"""
# Model selection
TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
RESPONSE_MODEL = 'ollama' # possible values: openai, groq, ollama
TTS_MODEL = 'deepgram' # possible values: openai, deepgram, elevenlabs, melotts, cartesia
RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
TTS_MODEL = 'fastxttsapi' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, fastxttsapi

# currently using the MeloTTS for local models. here is how to get started:
# https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install
@@ -59,8 +59,8 @@ def validate_config():
raise ValueError("Invalid TRANSCRIPTION_MODEL. Must be one of ['openai', 'groq', 'deepgram', 'fastwhisperapi', 'local']")
if Config.RESPONSE_MODEL not in ['openai', 'groq', 'ollama', 'local']:
raise ValueError("Invalid RESPONSE_MODEL. Must be one of ['openai', 'groq', 'local']")
if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']:
raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']")
if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']:
raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']")

if Config.TRANSCRIPTION_MODEL == 'openai' and not Config.OPENAI_API_KEY:
raise ValueError("OPENAI_API_KEY is required for OpenAI models")
42 changes: 33 additions & 9 deletions voice_assistant/text_to_speech.py
@@ -1,11 +1,11 @@
# voice_assistant/text_to_speech.py
import logging
import elevenlabs
import requests

from openai import OpenAI
from deepgram import DeepgramClient, SpeakOptions
from elevenlabs.client import ElevenLabs
from cartesia.tts import CartesiaTTS
from voice_assistant.audio import play_audio_stream
import soundfile as sf
import json

@@ -17,7 +17,7 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
Convert text to speech using the specified model.

Args:
model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'local').
model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'cartesia', 'melotts', 'fastxttsapi', 'local').
api_key (str): The API key for the TTS service.
text (str): The text to convert to speech.
output_file_path (str): The path to save the generated speech audio file.
@@ -48,11 +48,24 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)
elif model == 'elevenlabs':
ELEVENLABS_VOICE_ID = "Paul J."
client = ElevenLabs(api_key=api_key)
audio = client.generate(
text=text, voice=ELEVENLABS_VOICE_ID, output_format="mp3_22050_32", model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)
ELEVENLABS_URL = f'https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}/stream'
headers = {
'accept': '*/*',
'xi-api-key': api_key,
'Content-Type': 'application/json'
}
data = {
'text': text,
'voice_settings': {
'stability': 0.50,
'similarity_boost': 0.75
},
"output_format": "mp3_22050_32"
}

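# Stream the ElevenLabs response straight to ffplay instead of saving it to output_file_path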
with requests.post(ELEVENLABS_URL, headers=headers, json=data, stream=True) as r:
audio_stream = r.iter_content(chunk_size=512)
play_audio_stream(audio_stream)
elif model == "cartesia":
# config
with open('Barbershop Man.json') as f:
@@ -73,9 +86,20 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None
rate = output["sampling_rate"]
sf.write(output_file_path, buffer, rate)


elif model == "melotts": # this is a local model
generate_audio_file_melotts(text=text, filename=output_file_path)
elif model == "fastxttsapi":
# Set the URL for the FastXttsAPI server; change this to the address where the API is running, either locally or on a remote server
FASTXTTSAPI_URL = 'https://localhost:8000'
payload = {
"text": text,
"language": "en",
"voice": "Dionisio Schuyler", #Query the endpoint https://localhost:8000/voices to get the list of available voices
"stream": True,
}
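# verify=False accepts the self-signed certificate of a local FastXttsAPI instance; enable verification if the server has a trusted certificate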
with requests.post(FASTXTTSAPI_URL + "/v1/speech", json=payload, verify=False) as r:
audio_stream = r.iter_content(chunk_size=512)
play_audio_stream(audio_stream)
elif model == 'local':
# Placeholder for local TTS model
with open(output_file_path, "wb") as f:
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added voice_samples/Xtts_sample1.wav