-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtts_from_list.py
executable file
·103 lines (80 loc) · 3.38 KB
/
tts_from_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import base64
import json
import os
import sys
from google.cloud import texttospeech as gcp_tts
def log(x):
    """Write *x* (converted with ``str``) plus a newline to stderr, then flush.

    Diagnostics go to standard error so that standard output carries only the
    JSON document printed at the end of the script.
    """
    sys.stderr.write(str(x) + "\n")
    sys.stderr.flush()
def text_to_speech(word):
    """Synthesize *word* once with a male and once with a female en-US voice.

    Args:
        word: Text passed to the Text-to-Speech client as the synthesis input.

    Returns:
        A dict with the keys ``"male"`` and ``"female"``. Each value is the
        response's ``audio_content`` (requested as MP3), base64-encoded and
        decoded to a UTF-8 string.
    """
    testing = False
    if testing and word != "wide":
        # Offline stub: when the flag above is switched on, every word except
        # "wide" gets a fixed placeholder payload instead of a service call.
        placeholder = base64.b64encode("hello".encode("utf-8")).decode("utf-8")
        return {"male": placeholder, "female": placeholder}

    # Named voices from https://cloud.google.com/text-to-speech/docs/voices
    # Male voices: en-US-Wavenet-A, en-US-Wavenet-B, en-US-Wavenet-D, en-US-Wavenet-I, en-US-Wavenet-J
    # Female voices: en-US-Wavenet-C, en-US-Wavenet-E, en-US-Wavenet-F, en-US-Wavenet-G, en-US-Wavenet-H
    # good choices from https://cloud.google.com/text-to-speech samples
    # gender / voice / speaking_rate / pitch
    # M / B / 0.8 / -2
    # F / C / 0.8 / +6
    # F / F / 0.8 / +2
    # F / H / 0.8 / -2
    voice_names = {
        "male": "en-US-Wavenet-B",
        "female": "en-US-Wavenet-H",
    }

    tts_client = gcp_tts.TextToSpeechClient()
    request_text = gcp_tts.SynthesisInput(text=word)
    # Both voices share one audio configuration: MP3 output, rate 0.8, pitch -2.0.
    mp3_settings = gcp_tts.AudioConfig(
        audio_encoding=gcp_tts.AudioEncoding.MP3,
        speaking_rate=0.8,
        pitch=-2.0,
    )

    encoded = {}
    # Dict insertion order keeps the male request ahead of the female one.
    for gender, voice_name in voice_names.items():
        selection = gcp_tts.VoiceSelectionParams(language_code="en-US", name=voice_name)
        reply = tts_client.synthesize_speech(
            input=request_text,
            voice=selection,
            audio_config=mp3_settings,
        )
        encoded[gender] = base64.b64encode(reply.audio_content).decode("utf-8")
    return encoded
def word_to_path(word):
    """Return the cache file path for *word*.

    The path has the form ``words/audio/<first letter>/<name>.json``, where
    the first letter is lower-cased and the name is the lower-cased word with
    every space replaced by a hyphen.

    Raises:
        IndexError: If *word* is an empty string (it has no first character).
    """
    first_letter = word[0].lower()
    file_stem = "-".join(word.split(" ")).lower()
    return "words/audio/" + first_letter + "/" + file_stem + ".json"
def get_cached_audio(word):
    """Load previously cached audio for *word* from its JSON cache file.

    Args:
        word: The word whose cache file, located via ``word_to_path``, is read.

    Returns:
        The decoded JSON content of the cache file, or ``None`` when the file
        is missing, cannot be read, or does not contain valid JSON. A ``None``
        result tells the caller to regenerate the audio.
    """
    audio_path = word_to_path(word)
    try:
        with open(audio_path, "r") as f:
            audio = json.load(f)
    except FileNotFoundError:
        # The normal cache miss: nothing has been stored for this word yet.
        log(f"Failed to find cache for {word} at {audio_path}")
        return None
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # A damaged or unreadable cache file is treated as a miss, but reported
        # distinctly. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        log(f"Failed to read cache for {word} at {audio_path}: {err}")
        return None
    log(f"Got cache for {word}")
    return audio
def cache_audio(word, data):
    """Write *data* as indented JSON to the cache file for *word*.

    Args:
        word: The word whose cache file, located via ``word_to_path``, is written.
        data: A JSON-serializable object to store.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If *data* is not JSON-serializable.
    """
    audio_path = word_to_path(word)
    # The per-letter directory may not exist yet for the first word with a new
    # initial. Create it so the write does not fail after the audio has been
    # produced.
    os.makedirs(os.path.dirname(audio_path), exist_ok=True)
    with open(audio_path, "w") as f:
        json.dump(data, f, indent=4)
# Word list files, read in this order from the current year's directory.
word_files = [
    "wordlist.1b.txt",
    "wordlist.2b.txt",
    "wordlist.3b.txt",
]
current_year_path = "words/year/2024"

# Collect one entry per non-blank line, stripped of surrounding whitespace.
# Blank lines are skipped: an empty entry would later raise IndexError in
# word_to_path(), which indexes the first character of the word.
wordlist = []
for word_file in word_files:
    with open(os.path.join(current_year_path, word_file)) as f:
        for line in f:
            working_line = line.strip()
            if working_line:
                wordlist.append(working_line)
# Build one output record per word list entry, in sorted order. Audio is taken
# from the cache when available and synthesized (then cached) otherwise.
audio = []
for entry in sorted(wordlist):
    # An entry is either "word" or "word/x/y/..."; the text before the first
    # slash is the word to speak, and the remainder is split on "/" and stored
    # under the "alt" key.
    spoken, slash, remainder = entry.partition("/")
    log(f"Processing {spoken}... ")

    speech_data = get_cached_audio(spoken)
    if speech_data is None:
        speech_data = text_to_speech(spoken)
        cache_audio(spoken, speech_data)

    # Key order is "word", optional "alt", then "audio".
    record = {"word": spoken}
    if slash:
        record["alt"] = remainder.split("/")
    record["audio"] = speech_data
    audio.append(record)

log("Done")
# The JSON document is the script's only standard-output content.
print(json.dumps(audio))