Skip to content

Commit

Permalink
feat TTS synthAduio from API (#252)
Browse files Browse the repository at this point in the history
* feat TTS synthAduio from API

* fix failing testcase

* wip

* wip

* wip

* wip

* uip

* wip

* fix

* fix

* swagger
  • Loading branch information
xquanluu authored Nov 6, 2023
1 parent 1e9f388 commit f725d5f
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 104 deletions.
4 changes: 3 additions & 1 deletion app.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ const {
const {
getTtsVoices,
getTtsSize,
purgeTtsCache
purgeTtsCache,
synthAudio
} = require('@jambonz/speech-utils')(JAMBONES_REDIS_SENTINELS || {
host: process.env.JAMBONES_REDIS_HOST,
port: process.env.JAMBONES_REDIS_PORT || 6379
Expand Down Expand Up @@ -98,6 +99,7 @@ app.locals = {
getTtsVoices,
getTtsSize,
purgeTtsCache,
synthAudio,
lookupAppBySid,
lookupAccountBySid,
lookupAccountByPhoneNumber,
Expand Down
1 change: 1 addition & 0 deletions lib/routes/api/accounts.js
Original file line number Diff line number Diff line change
Expand Up @@ -1038,4 +1038,5 @@ router.get('/:sid/Queues', async(req, res) => {
}
});


module.exports = router;
76 changes: 2 additions & 74 deletions lib/routes/api/speech-credentials.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ const assert = require('assert');
const Account = require('../../models/account');
const SpeechCredential = require('../../models/speech-credential');
const sysError = require('../error');
const {decrypt, encrypt, obscureKey} = require('../../utils/encrypt-decrypt');
const {decrypt, encrypt} = require('../../utils/encrypt-decrypt');
const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils');
const {decryptCredential} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden} = require('../../utils/errors');
const {
testGoogleTts,
Expand Down Expand Up @@ -274,79 +275,6 @@ router.post('/', async(req, res) => {
}
});

function decryptCredential(obj, credential, logger) {
if ('google' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
const key_header = '-----BEGIN PRIVATE KEY-----\n';
const obscured = {
...o,
private_key: `${key_header}${obscureKey(o.private_key.slice(key_header.length, o.private_key.length))}`
};
obj.service_key = JSON.stringify(obscured);
}
else if ('aws' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.access_key_id = o.access_key_id;
obj.secret_access_key = obscureKey(o.secret_access_key);
obj.aws_region = o.aws_region;
logger.info({obj, o}, 'retrieving aws speech credential');
}
else if ('microsoft' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
obj.region = o.region;
obj.use_custom_tts = o.use_custom_tts;
obj.custom_tts_endpoint = o.custom_tts_endpoint;
obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url;
obj.use_custom_stt = o.use_custom_stt;
obj.custom_stt_endpoint = o.custom_stt_endpoint;
obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url;
logger.info({obj, o}, 'retrieving azure speech credential');
}
else if ('wellsaid' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
}
else if ('nuance' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id;
obj.secret = o.secret ? obscureKey(o.secret) : null;
}
else if ('deepgram' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
}
else if ('ibm' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.tts_api_key = obscureKey(o.tts_api_key);
obj.tts_region = o.tts_region;
obj.stt_api_key = obscureKey(o.stt_api_key);
obj.stt_region = o.stt_region;
obj.instance_id = o.instance_id;
} else if ('nvidia' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.riva_server_uri = o.riva_server_uri;
} else if ('cobalt' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.cobalt_server_uri = o.cobalt_server_uri;
} else if ('soniox' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
} else if ('elevenlabs' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
obj.model_id = o.model_id;
} else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential));
obj.auth_token = obscureKey(o.auth_token);
obj.custom_stt_url = o.custom_stt_url;
obj.custom_tts_url = o.custom_tts_url;
} else if ('assemblyai' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = obscureKey(o.api_key);
}
}

/**
* retrieve all speech credentials for an account
*/
Expand Down
82 changes: 82 additions & 0 deletions lib/routes/api/tts-cache.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@ const router = require('express').Router();
const {
parseAccountSid
} = require('./utils');
const SpeechCredential = require('../../models/speech-credential');
const fs = require('fs');
const { v4: uuidv4 } = require('uuid');
const {DbErrorBadRequest} = require('../../utils/errors');
const Account = require('../../models/account');
const sysError = require('../error');
const { getSpeechCredential, decryptCredential } = require('../../utils/speech-utils');

router.delete('/', async(req, res) => {
const {purgeTtsCache} = req.app.locals;
Expand All @@ -26,4 +33,79 @@ router.get('/', async(req, res) => {
res.status(200).json({size});
});

router.post('/Synthesize', async(req, res) => {
const {logger, synthAudio} = req.app.locals;
try {
const accountSid = parseAccountSid(req);
const body = req.body;
if (!body.speech_credential_sid || !body.text || !body.language || !body.voice) {
throw new DbErrorBadRequest('speech_credential_sid, text, language, voice are all required');
}

const result = await Account.retrieve(accountSid);
if (!result || result.length === 0 || !result[0].is_active) {
throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`);
}
const credentials = await SpeechCredential.retrieve(body.speech_credential_sid);
if (!credentials || credentials.length === 0) {
throw new
DbErrorBadRequest(`There is no available speech credential for ${body.speech_credential_sid}`);
}
const {credential, ...obj} = credentials[0];

decryptCredential(obj, credential, logger, false);
const cred = getSpeechCredential(obj, logger);

const { text, language, engine = 'standard' } = body;
const salt = uuidv4();
/* parse Nuance voices into name and model */
let voice = body.voice;
let model;
if (cred.vendor === 'nuance' && voice) {
const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice);
if (arr) {
voice = arr[1];
model = arr[2];
}
}
const stats = {
histogram: () => {},
increment: () => {},
};
const { filePath } = await synthAudio(stats, {
account_sid: accountSid,
text,
vendor: cred.vendor,
language,
voice,
engine,
model,
salt,
credentials: cred,
disableTtsCache: false
});

const stat = fs.statSync(filePath);
res.writeHead(200, {
'Content-Type': 'audio/mpeg',
'Content-Length': stat.size,
});

const readStream = fs.createReadStream(filePath);
// We replaced all the event handlers with a simple call to readStream.pipe()
readStream.pipe(res);

readStream.on('end', () => {
// Delete the file after it's been read
fs.unlink(filePath, (err) => {
if (err) throw err;
logger.info(`${filePath} was deleted`);
});
});

} catch (err) {
sysError(logger, res, err);
}
});

module.exports = router;
59 changes: 59 additions & 0 deletions lib/swagger/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4209,6 +4209,65 @@ paths:
application/json:
schema:
$ref: '#/components/schemas/RegisteredClient'
/Accounts/{AccountSid}/TtsCache/Synthesize:
post:
tags:
- Accounts
summary: get TTS from provider
operationId: Synthesize
requestBody:
content:
application/json:
schema:
type: object
properties:
speech_credential_sid:
type: string
description: Speech credential Sid
example: 553b4b6b-8918-4394-a46d-1e3c5a3c717b
text:
type: string
description: the text to convert to audio
example: Hello How are you
language:
type: string
description: language is used in text
example: en-US
voice:
type: string
description: voice ID
example: en-US-Standard-C
required:
- speech_credential_sid
- text
- language
- voice
responses:
200:
description: Audio is created
content:
audio/mpeg:
schema:
type: string
format: binary
400:
description: bad request
content:
application/json:
schema:
$ref: '#/components/schemas/GeneralError'
422:
description: unprocessable entity
content:
application/json:
schema:
$ref: '#/components/schemas/GeneralError'
500:
description: system error
content:
application/json:
schema:
$ref: '#/components/schemas/GeneralError'
/Lcrs:
post:
tags:
Expand Down
106 changes: 105 additions & 1 deletion lib/utils/speech-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node');
const bent = require('bent');
const fs = require('fs');
const { AssemblyAI } = require('assemblyai');
const {decrypt, obscureKey} = require('./encrypt-decrypt');


const testSonioxStt = async(logger, credentials) => {
Expand Down Expand Up @@ -287,6 +288,107 @@ const testAssemblyStt = async(logger, credentials) => {
});
};

const getSpeechCredential = (credential, logger) => {
const {vendor} = credential;
logger.info(
`Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`);
if ('google' === vendor) {
try {
const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n'));
return {
...credential,
credentials: cred
};
} catch (err) {
logger.info({err}, `malformed google service_key provisioned for account ${credential.speech_credential_sid}`);
}
}
else if (['aws', 'polly'].includes(vendor)) {
return {
...credential,
accessKeyId: credential.access_key_id,
secretAccessKey: credential.secret_access_key,
region: credential.aws_region || 'us-east-1'
};
}
return credential;
};

function decryptCredential(obj, credential, logger, isObscureKey = true) {
if ('google' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
const key_header = '-----BEGIN PRIVATE KEY-----\n';
const obscured = {
...o,
private_key: `${key_header}${isObscureKey ?
obscureKey(o.private_key.slice(key_header.length, o.private_key.length)) :
o.private_key.slice(key_header.length, o.private_key.length)}`
};
obj.service_key = JSON.stringify(obscured);
}
else if ('aws' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.access_key_id = o.access_key_id;
obj.secret_access_key = isObscureKey ? obscureKey(o.secret_access_key) : o.secret_access_key;
obj.aws_region = o.aws_region;
logger.info({obj, o}, 'retrieving aws speech credential');
}
else if ('microsoft' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.region = o.region;
obj.use_custom_tts = o.use_custom_tts;
obj.custom_tts_endpoint = o.custom_tts_endpoint;
obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url;
obj.use_custom_stt = o.use_custom_stt;
obj.custom_stt_endpoint = o.custom_stt_endpoint;
obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url;
logger.info({obj, o}, 'retrieving azure speech credential');
}
else if ('wellsaid' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
}
else if ('nuance' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id;
obj.secret = o.secret ? (isObscureKey ? obscureKey(o.secret) : o.secret) : null;
}
else if ('deepgram' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
}
else if ('ibm' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.tts_api_key = isObscureKey ? obscureKey(o.tts_api_key) : o.tts_api_key;
obj.tts_region = o.tts_region;
obj.stt_api_key = isObscureKey ? obscureKey(o.stt_api_key) : o.stt_api_key;
obj.stt_region = o.stt_region;
obj.instance_id = o.instance_id;
} else if ('nvidia' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.riva_server_uri = o.riva_server_uri;
} else if ('cobalt' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.cobalt_server_uri = o.cobalt_server_uri;
} else if ('soniox' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
} else if ('elevenlabs' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
} else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential));
obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token;
obj.custom_stt_url = o.custom_stt_url;
obj.custom_tts_url = o.custom_tts_url;
} else if ('assemblyai' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
}
}

module.exports = {
testGoogleTts,
testGoogleStt,
Expand All @@ -303,5 +405,7 @@ module.exports = {
testIbmStt,
testSonioxStt,
testElevenlabs,
testAssemblyStt
testAssemblyStt,
getSpeechCredential,
decryptCredential
};
Loading

0 comments on commit f725d5f

Please sign in to comment.