diff --git a/app.js b/app.js index 43ce96a5..2686673b 100644 --- a/app.js +++ b/app.js @@ -52,7 +52,8 @@ const { const { getTtsVoices, getTtsSize, - purgeTtsCache + purgeTtsCache, + synthAudio } = require('@jambonz/speech-utils')(JAMBONES_REDIS_SENTINELS || { host: process.env.JAMBONES_REDIS_HOST, port: process.env.JAMBONES_REDIS_PORT || 6379 @@ -98,6 +99,7 @@ app.locals = { getTtsVoices, getTtsSize, purgeTtsCache, + synthAudio, lookupAppBySid, lookupAccountBySid, lookupAccountByPhoneNumber, diff --git a/lib/routes/api/accounts.js b/lib/routes/api/accounts.js index c5abb80b..5e962094 100644 --- a/lib/routes/api/accounts.js +++ b/lib/routes/api/accounts.js @@ -1038,4 +1038,5 @@ router.get('/:sid/Queues', async(req, res) => { } }); + module.exports = router; diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js index 3756b980..41f56739 100644 --- a/lib/routes/api/speech-credentials.js +++ b/lib/routes/api/speech-credentials.js @@ -3,8 +3,9 @@ const assert = require('assert'); const Account = require('../../models/account'); const SpeechCredential = require('../../models/speech-credential'); const sysError = require('../error'); -const {decrypt, encrypt, obscureKey} = require('../../utils/encrypt-decrypt'); +const {decrypt, encrypt} = require('../../utils/encrypt-decrypt'); const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils'); +const {decryptCredential} = require('../../utils/speech-utils'); const {DbErrorUnprocessableRequest, DbErrorForbidden} = require('../../utils/errors'); const { testGoogleTts, @@ -274,79 +275,6 @@ router.post('/', async(req, res) => { } }); -function decryptCredential(obj, credential, logger) { - if ('google' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - const key_header = '-----BEGIN PRIVATE KEY-----\n'; - const obscured = { - ...o, - private_key: `${key_header}${obscureKey(o.private_key.slice(key_header.length, o.private_key.length))}` - }; - 
obj.service_key = JSON.stringify(obscured); - } - else if ('aws' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.access_key_id = o.access_key_id; - obj.secret_access_key = obscureKey(o.secret_access_key); - obj.aws_region = o.aws_region; - logger.info({obj, o}, 'retrieving aws speech credential'); - } - else if ('microsoft' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - obj.region = o.region; - obj.use_custom_tts = o.use_custom_tts; - obj.custom_tts_endpoint = o.custom_tts_endpoint; - obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url; - obj.use_custom_stt = o.use_custom_stt; - obj.custom_stt_endpoint = o.custom_stt_endpoint; - obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url; - logger.info({obj, o}, 'retrieving azure speech credential'); - } - else if ('wellsaid' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } - else if ('nuance' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.client_id = o.client_id; - obj.secret = o.secret ? 
obscureKey(o.secret) : null; - } - else if ('deepgram' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } - else if ('ibm' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.tts_api_key = obscureKey(o.tts_api_key); - obj.tts_region = o.tts_region; - obj.stt_api_key = obscureKey(o.stt_api_key); - obj.stt_region = o.stt_region; - obj.instance_id = o.instance_id; - } else if ('nvidia' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.riva_server_uri = o.riva_server_uri; - } else if ('cobalt' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.cobalt_server_uri = o.cobalt_server_uri; - } else if ('soniox' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } else if ('elevenlabs' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - obj.model_id = o.model_id; - } else if (obj.vendor.startsWith('custom:')) { - const o = JSON.parse(decrypt(credential)); - obj.auth_token = obscureKey(o.auth_token); - obj.custom_stt_url = o.custom_stt_url; - obj.custom_tts_url = o.custom_tts_url; - } else if ('assemblyai' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } -} - /** * retrieve all speech credentials for an account */ diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 7bc5c371..9a0ff995 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -2,6 +2,13 @@ const router = require('express').Router(); const { parseAccountSid } = require('./utils'); +const SpeechCredential = require('../../models/speech-credential'); +const fs = require('fs'); +const { v4: uuidv4 } = require('uuid'); +const {DbErrorBadRequest} = require('../../utils/errors'); +const Account = require('../../models/account'); +const sysError = require('../error'); +const { getSpeechCredential, 
decryptCredential } = require('../../utils/speech-utils');
 
 router.delete('/', async(req, res) => {
   const {purgeTtsCache} = req.app.locals;
@@ -26,4 +33,107 @@ router.get('/', async(req, res) => {
   res.status(200).json({size});
 });
 
+/**
+ * Synthesize speech from text with one of the stored speech credentials
+ * and stream the resulting audio file back as audio/mpeg.
+ *
+ * Body: speech_credential_sid, text, language and voice are required;
+ * engine is optional and defaults to 'standard'.
+ */
+router.post('/Synthesize', async(req, res) => {
+  const {logger, synthAudio} = req.app.locals;
+  let filePath;
+
+  /* best-effort, idempotent removal of the synthesized audio file */
+  const removeAudioFile = () => {
+    if (!filePath) return;
+    const path = filePath;
+    filePath = null;
+    fs.unlink(path, (err) => {
+      /* log instead of throwing: a throw here would be an uncaught exception */
+      if (err) return logger.error({err}, `failed to delete ${path}`);
+      logger.info(`${path} was deleted`);
+    });
+  };
+
+  try {
+    const accountSid = parseAccountSid(req);
+    const body = req.body;
+    if (!body.speech_credential_sid || !body.text || !body.language || !body.voice) {
+      throw new DbErrorBadRequest('speech_credential_sid, text, language, voice are all required');
+    }
+
+    const result = await Account.retrieve(accountSid);
+    if (!result || result.length === 0 || !result[0].is_active) {
+      throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`);
+    }
+    const credentials = await SpeechCredential.retrieve(body.speech_credential_sid);
+    if (!credentials || credentials.length === 0) {
+      throw new
+      DbErrorBadRequest(`There is no available speech credential for ${body.speech_credential_sid}`);
+    }
+    /* NOTE(review): the credential is not checked against accountSid here -- confirm that is intended */
+    const {credential, ...obj} = credentials[0];
+
+    /* the provider call needs the real secrets, hence isObscureKey = false */
+    decryptCredential(obj, credential, logger, false);
+    const cred = getSpeechCredential(obj, logger);
+
+    const { text, language, engine = 'standard' } = body;
+    const salt = uuidv4();
+    /* parse Nuance voices into name and model */
+    let voice = body.voice;
+    let model;
+    if (cred.vendor === 'nuance' && voice) {
+      const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice);
+      if (arr) {
+        voice = arr[1];
+        model = arr[2];
+      }
+    }
+    /* synthAudio is given a no-op stats collector */
+    const stats = {
+      histogram: () => {},
+      increment: () => {},
+    };
+    ({ filePath } = await synthAudio(stats, {
+      account_sid: accountSid,
+      text,
+      vendor: cred.vendor,
+      language,
+      voice,
+      engine,
+      model,
+      salt,
+      credentials: cred,
+      disableTtsCache: false
+    }));
+
+    const stat = await fs.promises.stat(filePath);
+    res.writeHead(200, {
+      'Content-Type': 'audio/mpeg',
+      'Content-Length': stat.size,
+    });
+
+    const readStream = fs.createReadStream(filePath);
+    /* 'close' follows a normal end as well as an error or destroy() */
+    readStream.on('close', removeAudioFile);
+    readStream.on('error', (err) => {
+      logger.error({err}, 'error reading synthesized audio');
+      res.destroy(err);
+    });
+    /* stop reading if the client goes away before the file is fully sent */
+    res.on('close', () => readStream.destroy());
+    readStream.pipe(res);
+  } catch (err) {
+    removeAudioFile();
+    if (res.headersSent) {
+      /* too late for an error response; just terminate the connection */
+      logger.error({err}, 'error while streaming synthesized audio');
+      return res.destroy(err);
+    }
+    sysError(logger, res, err);
+  }
+});
+
 module.exports = router;
diff --git a/lib/swagger/swagger.yaml b/lib/swagger/swagger.yaml
index 11149f29..e6463fa6 100644
--- a/lib/swagger/swagger.yaml
+++ b/lib/swagger/swagger.yaml
@@ -4209,6 +4209,75 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/RegisteredClient'
+  /Accounts/{AccountSid}/TtsCache/Synthesize:
+    parameters:
+      - name: AccountSid
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      tags:
+        - Accounts
+      summary: get TTS from provider
+      operationId: Synthesize
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                speech_credential_sid:
+                  type: string
+                  description: Speech credential Sid
+                  example: 553b4b6b-8918-4394-a46d-1e3c5a3c717b
+                text:
+                  type: string
+                  description: the text to convert to audio
+                  example: Hello How are you
+                language:
+                  type: string
+                  description: language of the text
+                  example: en-US
+                voice:
+                  type: string
+                  description: voice ID
+                  example: en-US-Standard-C
+                engine:
+                  type: string
+                  description: synthesis engine, defaults to standard when omitted
+                  example: standard
+              required:
+                - speech_credential_sid
+                - text
+                - language
+                - voice
+      responses:
+        200:
+          description: Audio is created
+          content:
+            audio/mpeg:
+              schema:
+                type: string
+                format: binary
+        400:
+          description: bad request
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GeneralError'
+        422:
+          description: unprocessable entity
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GeneralError'
+        500:
+          description: system error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GeneralError'
   /Lcrs:
     post:
       tags:
diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js
index 35bc0972..215e2d43 100644
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -6,6 +6,7 @@ const { SpeechClient } = 
require('@soniox/soniox-node');
 const bent = require('bent');
 const fs = require('fs');
 const { AssemblyAI } = require('assemblyai');
+const {decrypt, obscureKey} = require('./encrypt-decrypt');
 
 
 const testSonioxStt = async(logger, credentials) => {
@@ -287,6 +288,127 @@ const testAssemblyStt = async(logger, credentials) => {
   });
 };
 
+/**
+ * Shape a decrypted speech credential for the synthesizer: google gets its
+ * service_key parsed into `credentials`, aws/polly get camel-cased key fields
+ * and a default region, and every other vendor is returned unchanged.
+ *
+ * @param {object} credential - credential whose vendor fields are already decrypted
+ * @param {object} logger
+ * @returns {object} the credential, augmented as described above
+ */
+const getSpeechCredential = (credential, logger) => {
+  const {vendor} = credential;
+  logger.info(
+    `Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`);
+  if ('google' === vendor) {
+    try {
+      const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n'));
+      return {
+        ...credential,
+        credentials: cred
+      };
+    } catch (err) {
+      /* fall through and return the credential without a parsed key */
+      logger.info({err},
+        `malformed google service_key provisioned for speech credential ${credential.speech_credential_sid}`);
+    }
+  }
+  else if (['aws', 'polly'].includes(vendor)) {
+    return {
+      ...credential,
+      accessKeyId: credential.access_key_id,
+      secretAccessKey: credential.secret_access_key,
+      region: credential.aws_region || 'us-east-1'
+    };
+  }
+  return credential;
+};
+
+/**
+ * Decrypt a stored speech credential and copy its vendor-specific fields
+ * onto obj (mutated in place). Unrecognised vendors leave obj untouched.
+ *
+ * @param {object} obj - credential record; obj.vendor selects the fields
+ * @param {string} credential - encrypted JSON, as accepted by decrypt()
+ * @param {object} logger
+ * @param {boolean} [isObscureKey=true] - pass secrets through obscureKey();
+ *   use false only when the real secret is needed to call the vendor
+ */
+function decryptCredential(obj, credential, logger, isObscureKey = true) {
+  const {vendor} = obj;
+  /* decrypt lazily so an unrecognised vendor never touches the payload */
+  const parse = () => JSON.parse(decrypt(credential));
+  const secret = (value) => (isObscureKey ? obscureKey(value) : value);
+
+  if ('google' === vendor) {
+    const o = parse();
+    if (!isObscureKey) {
+      /* return the key untouched rather than rebuilding it around an assumed header */
+      obj.service_key = JSON.stringify(o);
+      return;
+    }
+    const key_header = '-----BEGIN PRIVATE KEY-----\n';
+    obj.service_key = JSON.stringify({
+      ...o,
+      private_key: `${key_header}${obscureKey(o.private_key.slice(key_header.length))}`
+    });
+  }
+  else if ('aws' === vendor) {
+    const o = parse();
+    obj.access_key_id = o.access_key_id;
+    obj.secret_access_key = secret(o.secret_access_key);
+    obj.aws_region = o.aws_region;
+    /* never log obj or o here: both can hold the secret in clear text */
+    logger.info({vendor}, 'retrieving aws speech credential');
+  }
+  else if ('microsoft' === vendor) {
+    const o = parse();
+    obj.api_key = secret(o.api_key);
+    obj.region = o.region;
+    obj.use_custom_tts = o.use_custom_tts;
+    obj.custom_tts_endpoint = o.custom_tts_endpoint;
+    obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url;
+    obj.use_custom_stt = o.use_custom_stt;
+    obj.custom_stt_endpoint = o.custom_stt_endpoint;
+    obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url;
+    logger.info({vendor}, 'retrieving azure speech credential');
+  }
+  else if (['wellsaid', 'deepgram', 'soniox', 'assemblyai'].includes(vendor)) {
+    obj.api_key = secret(parse().api_key);
+  }
+  else if ('nuance' === vendor) {
+    const o = parse();
+    obj.client_id = o.client_id;
+    obj.secret = o.secret ? secret(o.secret) : null;
+  }
+  else if ('ibm' === vendor) {
+    const o = parse();
+    obj.tts_api_key = secret(o.tts_api_key);
+    obj.tts_region = o.tts_region;
+    obj.stt_api_key = secret(o.stt_api_key);
+    obj.stt_region = o.stt_region;
+    obj.instance_id = o.instance_id;
+  }
+  else if ('nvidia' === vendor) {
+    obj.riva_server_uri = parse().riva_server_uri;
+  }
+  else if ('cobalt' === vendor) {
+    obj.cobalt_server_uri = parse().cobalt_server_uri;
+  }
+  else if ('elevenlabs' === vendor) {
+    const o = parse();
+    obj.api_key = secret(o.api_key);
+    obj.model_id = o.model_id;
+  }
+  else if (vendor.startsWith('custom:')) {
+    const o = parse();
+    obj.auth_token = secret(o.auth_token);
+    obj.custom_stt_url = o.custom_stt_url;
+    obj.custom_tts_url = o.custom_tts_url;
+  }
+}
+
 module.exports = {
   testGoogleTts,
   testGoogleStt,
@@ -303,5 +425,7 @@ module.exports = {
   testIbmStt,
   testSonioxStt,
   testElevenlabs,
-  testAssemblyStt
+  testAssemblyStt,
+  getSpeechCredential,
+  decryptCredential
 };
diff --git a/package-lock.json b/package-lock.json
index 97adc99e..a4d57d29 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,9 +19,9 @@
         "@jambonz/lamejs": "^1.2.2",
         "@jambonz/mw-registrar": "^0.2.5",
         "@jambonz/realtimedb-helpers": "^0.8.7",
-        "@jambonz/speech-utils": "^0.0.15",
+        "@jambonz/speech-utils": "^0.0.24",
         "@jambonz/time-series": "^0.2.8",
-        "@jambonz/verb-specifications": "^0.0.29",
+        "@jambonz/verb-specifications": "^0.0.45",
         "@soniox/soniox-node": "^1.1.1",
         "argon2": "^0.30.3",
         "assemblyai": "^3.0.1",
@@ -35,7 +35,7 @@
         "ibm-watson": "^7.1.2",
         "jsonwebtoken": "^9.0.0",
         "mailgun.js": "^9.1.2",
-        "microsoft-cognitiveservices-speech-sdk": "^1.24.1",
+        "microsoft-cognitiveservices-speech-sdk": "1.31.0",
         "mysql2": "^2.3.3",
         "nocache": "3.0.4",
         "passport": "^0.6.0",
@@ -1968,11 +1968,11 @@
       }
     },
     "node_modules/@jambonz/speech-utils": {
-      "version": "0.0.15",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.15.tgz",
-      "integrity": "sha512-di8jVnSdCXzkhKrX4vS5MyALw2KS3D1kodGiral3E9NrcUXDZhiLs7vZOW4EdAS1TTykl+Rb/GwNQ8xqkufxKQ==",
+      "version": "0.0.24",
+      "resolved": 
"https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.24.tgz", + "integrity": "sha512-FpywxkjohC7wBBS9Xz9pRYXyO9D/tt2xgHFseWSoe0kO/6DbFudoLAI5uGB7AyUbTeGeKBiVZDJSNadsM4Iv4A==", "dependencies": { - "@aws-sdk/client-polly": "^3.303.0", + "@aws-sdk/client-polly": "^3.359.0", "@google-cloud/text-to-speech": "^4.2.1", "@grpc/grpc-js": "^1.8.13", "bent": "^7.3.12", @@ -1981,7 +1981,7 @@ "google-protobuf": "^3.21.2", "ibm-watson": "^8.0.0", "ioredis": "^5.3.2", - "microsoft-cognitiveservices-speech-sdk": "^1.26.0", + "microsoft-cognitiveservices-speech-sdk": "^1.31.0", "undici": "^5.21.0" } }, @@ -2096,9 +2096,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.29.tgz", - "integrity": "sha512-jeYI+GN7Y5nXhdFG3SXvXaBlhCjIC+l5AcBywDDGxxyuuKRTukPS0MSvCtWPZP6H3wYYGqfJ4DR/vgtBF3pvyQ==", + "version": "0.0.45", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.45.tgz", + "integrity": "sha512-0cC7cfyXuOlqjfrtA9GC7A84efInj4z+ZSsibONqHMw3FVJE5IvcvabRojarDHooIn9Uw6AEX/zZ7BZqfgVmJw==", "dependencies": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -6872,9 +6872,9 @@ } }, "node_modules/microsoft-cognitiveservices-speech-sdk": { - "version": "1.28.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.28.0.tgz", - "integrity": "sha512-d+hCqTSeVCGtog5BgUKdIVTNifuigap9VSJbtDUP4kW1uG/yp7zRnqsjYx9nV9sRfuiYwZCyFzGG+VXGa37QDw==", + "version": "1.31.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.31.0.tgz", + "integrity": "sha512-wmNi0XoGtQwRoI2To6QSrGHVW0d8WfhJwXtE2nk48l4YkBiDqdPV2tdSXFHRrdv3uwr/+THip45H91Fllpm8qA==", "dependencies": { "agent-base": "^6.0.1", "bent": "^7.3.12", @@ -11295,11 +11295,11 @@ } }, "@jambonz/speech-utils": { - "version": "0.0.15", - 
"resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.15.tgz", - "integrity": "sha512-di8jVnSdCXzkhKrX4vS5MyALw2KS3D1kodGiral3E9NrcUXDZhiLs7vZOW4EdAS1TTykl+Rb/GwNQ8xqkufxKQ==", + "version": "0.0.24", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.24.tgz", + "integrity": "sha512-FpywxkjohC7wBBS9Xz9pRYXyO9D/tt2xgHFseWSoe0kO/6DbFudoLAI5uGB7AyUbTeGeKBiVZDJSNadsM4Iv4A==", "requires": { - "@aws-sdk/client-polly": "^3.303.0", + "@aws-sdk/client-polly": "^3.359.0", "@google-cloud/text-to-speech": "^4.2.1", "@grpc/grpc-js": "^1.8.13", "bent": "^7.3.12", @@ -11308,7 +11308,7 @@ "google-protobuf": "^3.21.2", "ibm-watson": "^8.0.0", "ioredis": "^5.3.2", - "microsoft-cognitiveservices-speech-sdk": "^1.26.0", + "microsoft-cognitiveservices-speech-sdk": "^1.31.0", "undici": "^5.21.0" }, "dependencies": { @@ -11414,9 +11414,9 @@ } }, "@jambonz/verb-specifications": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.29.tgz", - "integrity": "sha512-jeYI+GN7Y5nXhdFG3SXvXaBlhCjIC+l5AcBywDDGxxyuuKRTukPS0MSvCtWPZP6H3wYYGqfJ4DR/vgtBF3pvyQ==", + "version": "0.0.45", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.45.tgz", + "integrity": "sha512-0cC7cfyXuOlqjfrtA9GC7A84efInj4z+ZSsibONqHMw3FVJE5IvcvabRojarDHooIn9Uw6AEX/zZ7BZqfgVmJw==", "requires": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -15124,9 +15124,9 @@ } }, "microsoft-cognitiveservices-speech-sdk": { - "version": "1.28.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.28.0.tgz", - "integrity": "sha512-d+hCqTSeVCGtog5BgUKdIVTNifuigap9VSJbtDUP4kW1uG/yp7zRnqsjYx9nV9sRfuiYwZCyFzGG+VXGa37QDw==", + "version": "1.31.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.31.0.tgz", + "integrity": 
"sha512-wmNi0XoGtQwRoI2To6QSrGHVW0d8WfhJwXtE2nk48l4YkBiDqdPV2tdSXFHRrdv3uwr/+THip45H91Fllpm8qA==", "requires": { "agent-base": "^6.0.1", "bent": "^7.3.12", diff --git a/package.json b/package.json index cf12e5a1..749a1854 100644 --- a/package.json +++ b/package.json @@ -29,9 +29,9 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.5", "@jambonz/realtimedb-helpers": "^0.8.7", - "@jambonz/speech-utils": "^0.0.15", + "@jambonz/speech-utils": "^0.0.24", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.29", + "@jambonz/verb-specifications": "^0.0.45", "@soniox/soniox-node": "^1.1.1", "argon2": "^0.30.3", "assemblyai": "^3.0.1", @@ -45,7 +45,7 @@ "ibm-watson": "^7.1.2", "jsonwebtoken": "^9.0.0", "mailgun.js": "^9.1.2", - "microsoft-cognitiveservices-speech-sdk": "^1.24.1", + "microsoft-cognitiveservices-speech-sdk": "1.31.0", "mysql2": "^2.3.3", "nocache": "3.0.4", "passport": "^0.6.0",